feat(agent): optimize enrich_hf_metadata with README cleaning, Ollama native API, and expanded fields

- Add clean_readme_for_llm() to strip noise from README before LLM injection - Keep widget section text (valuable tag signal) and unmarked code blocks (trigger words) - Preserve standalone image alt text instead of removing entirely - Switch Ollama to native /api/chat with think:false to fix empty content on thinking models - Extract Sample Gallery table images and deduplicate with widget images - Only strip code blocks with explicit language tags (bash) - Add notes and usage_tips fields to SKILL.md output format and post-processor - Clean up dead code, fix regex edge cases, remove double type annotation
2026-07-05 17:01:16 -03:00 · 2026-07-04 08:01:50 +08:00
parent b22f09bd1d
commit a1fd4e150b
6 changed files with 937 additions and 30 deletions
--- a/py/services/agent/agent_service.py
+++ b/py/services/agent/agent_service.py
@@ -28,6 +28,7 @@ from ..llm_service import LLMService
 from ..websocket_manager import ws_manager
 from .post_processor import PostProcessor
 from .skill_registry import SkillRegistry
 from .skills.enrich_hf_metadata.md_to_html import clean_readme_for_llm
 logger = logging.getLogger(__name__)
@@ -368,7 +369,8 @@ class AgentService:
        context["repo"] = repo or ""
        if repo:
            readme = await self._fetch_readme(repo)
-            context["readme_content"] = readme[:8000] if readme else "(README not available)"
+            cleaned = clean_readme_for_llm(readme) if readme else ""
            context["readme_content"] = cleaned if cleaned else "(README not available)"
            context["readme_content_full"] = readme or ""
        try:
--- a/py/services/agent/post_processor.py
+++ b/py/services/agent/post_processor.py
@@ -10,6 +10,7 @@ refresh cache).  All actual I/O is delegated to :mod:`~py.agent_cli`.
 from __future__ import annotations
 import json
 import logging
 import os
 from datetime import datetime, timezone
@@ -79,6 +80,7 @@ class PostProcessor:
        from .skills.enrich_hf_metadata.md_to_html import (
            convert_readme_to_html,
            extract_gallery_images,
            extract_gallery_table_images,
            extract_repo_from_hf_url,
        )
@@ -127,23 +129,38 @@ class PostProcessor:
            desc_civitai["description"] = short_desc
            updates["civitai"] = desc_civitai
-        # gallery images → civitai.images (from YAML frontmatter widget entries)
+        # gallery images → civitai.images (from YAML frontmatter widget entries
        # and Sample Gallery markdown tables in the README body)
        gallery_images: List[Dict[str, Any]] = []
        if readme_content and is_hf_model:
            hf_url = metadata.get("hf_url", "") or ""
            repo = extract_repo_from_hf_url(hf_url)
            if repo:
                rec_w = llm_output.get("recommended_width") or 0
                rec_h = llm_output.get("recommended_height") or 0
                # 1. Widget images (YAML frontmatter)
                gallery = extract_gallery_images(
                    readme_content, repo,
                    default_width=rec_w, default_height=rec_h,
                )
-                if gallery:
+
                # 2. Sample Gallery table images (markdown body), deduplicated
                existing_urls = {img["url"] for img in gallery if img.get("url")}
                table_images = extract_gallery_table_images(
                    readme_content, repo,
                    existing_urls=existing_urls,
                    default_width=rec_w, default_height=rec_h,
                )
                all_images = gallery + table_images
                if all_images:
                    gallery_images = all_images
                    current_civitai = metadata.get("civitai") or {}
                    gallery_civitai = dict(current_civitai)
                    if "civitai" in updates and isinstance(updates["civitai"], dict):
                        gallery_civitai.update(updates["civitai"])
-                    gallery_civitai["images"] = gallery
+                    gallery_civitai["images"] = all_images
                    updates["civitai"] = gallery_civitai
        # tags
@@ -159,6 +176,11 @@ class PostProcessor:
        updates["llm_enriched_at"] = datetime.now(timezone.utc).isoformat()
        preview_remote_url = (llm_output.get("preview_url") or "").strip()
        # Fallback: if the LLM couldn't find a preview image in the cleaned
        # README, use the first extracted gallery image (YAML widget entries
        # come first, followed by Sample Gallery table images).
        if not preview_remote_url and gallery_images:
            preview_remote_url = gallery_images[0].get("url", "")
        current_preview = metadata.get("preview_url") or ""
        if preview_remote_url and not (current_preview and os.path.exists(current_preview)):
            local_path = await download_preview(model_path, preview_remote_url)
@@ -166,6 +188,22 @@ class PostProcessor:
                preview_downloaded = True
                updates["preview_url"] = local_path
        # notes — plain-text summary of usage info from the LLM
        new_notes = (llm_output.get("notes") or "").strip()
        if new_notes:
            updates["notes"] = new_notes
        # usage_tips — JSON string (e.g. {"strength_min":0.85,"strength_max":1.4})
        raw_tips = (llm_output.get("usage_tips") or "").strip()
        if raw_tips and raw_tips != "{}":
            try:
                json.loads(raw_tips)
                updates["usage_tips"] = raw_tips
            except (json.JSONDecodeError, TypeError):
                logger.warning(
                    "LLM returned invalid usage_tips JSON: %s", raw_tips[:200]
                )
        if updates:
            updated_fields = await apply_metadata_updates(model_path, updates)
--- a/py/services/agent/skills/enrich_hf_metadata/SKILL.md
+++ b/py/services/agent/skills/enrich_hf_metadata/SKILL.md
@@ -84,6 +84,25 @@ The recommended image generation resolution for this model, in pixels. Look for
 ### preview_url
 The URL of the most suitable preview image from the README. Look for image tags (e.g. `![alt](url)`) and the YAML frontmatter `widget:` section (which often has `output.url` fields). Choose the first image that appears to be a generation example (not a logo or diagram). Construct the absolute URL as `https://huggingface.co/{{repo}}/resolve/main/{filename}`. If no suitable image is found, return an empty string.
 ### notes
 A plain-text summary of the model card's key practical usage information. Combine trigger words, style modifiers, recommended parameters (steps, CFG, resolution, sampler), and any setup tips into a readable paragraph. Return empty string if the README has no useful usage info.
 ### usage_tips
 A JSON string with structured usage recommendations. Extract from the README any explicit ranges or recommended values (e.g. "Set LoRA strength: **0.85 - 1.4**", "CLIP strength: 0.5"). Possible fields (include only those you can determine):
 ```json
 {
  "strength_min": 0.85,
  "strength_max": 1.4,
  "strength_range": "0.85-1.4",
  "strength": 0.6,
  "clip_strength": 0.5,
  "clip_skip": 2
 }
 ```
 Return the JSON string (e.g. `'{"strength_min":0.85,"strength_max":1.4}'`). Return `"{}"` if nothing useful is found.
 ### confidence
 Your confidence level in the extracted data:
 - "high" — most fields were explicitly stated in the README
@@ -104,6 +123,8 @@ Return ONLY a JSON object with exactly these fields (no markdown fences, no extr
  "recommended_width": 768,
  "recommended_height": 1024,
  "preview_url": "<image URL or empty string>",
  "notes": "<plain-text usage summary or empty string>",
  "usage_tips": "<JSON string like '{\"strength_min\":0.85,\"strength_max\":1.4}' or '{}'>",
  "confidence": "<high|medium|low>"
 }
 ```
--- a/py/services/agent/skills/enrich_hf_metadata/md_to_html.py
+++ b/py/services/agent/skills/enrich_hf_metadata/md_to_html.py
@@ -1,8 +1,13 @@
-"""Inline markdown-to-HTML converter for HF README content.
+"""Inline markdown-to-HTML converter and LLM-prompt cleaner for HF README content.
 No external dependencies.  Strips YAML frontmatter, ``<Gallery />`` sections,
 badge images, and HTML comments before rendering.  Only used by the
 ``enrich_hf_metadata`` skill.
 Also provides :func:`clean_readme_for_llm` which pre-processes the raw README
 before it is injected into the LLM prompt, removing content that has little
 value for metadata extraction (language-tagged code blocks, standalone image
 embeds, training tables, boilerplate sections, massive lists, etc.).
 """
 from __future__ import annotations
@@ -118,6 +123,88 @@ def extract_gallery_images(
    return images
 def extract_gallery_table_images(
    markdown_text: str,
    repo: str,
    existing_urls: set | None = None,
    default_width: int = 512,
    default_height: int = 512,
 ) -> list[dict]:
    """Extract images from ``| Preview | Prompt |`` markdown gallery tables.

    A table is considered a gallery when the first cell of its header row
    contains one of ``preview``, ``sample``, ``gallery``, ``image`` or
    ``thumbnail`` (case-insensitive) and the next line is a separator row
    that starts with ``|``.  For every data row whose first cell holds a
    markdown image, the second cell is used as the prompt.

    Args:
        markdown_text: Raw README markdown.
        repo: HuggingFace repo id (``owner/name``) used to resolve
            relative image paths to ``.../resolve/main/<path>`` URLs.
        existing_urls: URLs that were already collected elsewhere; rows
            resolving to one of them are skipped.  The set is not mutated.
        default_width: Width recorded for every extracted image.
        default_height: Height recorded for every extracted image.

    Returns:
        A list of image dicts (``url``, ``type``, ``nsfwLevel``, ``width``,
        ``height``, ``meta``, ``hasMeta``, ``hasPositivePrompt``), one per
        unique URL, in document order.  Empty when either *markdown_text*
        or *repo* is falsy.
    """
    if not markdown_text or not repo:
        return []

    base_url = f"https://huggingface.co/{repo}/resolve/main"
    gallery_keywords = ("preview", "sample", "gallery", "image", "thumbnail")
    images: list[dict] = []
    # Copy so the caller's set is left untouched.
    seen_urls: set = set(existing_urls) if existing_urls else set()

    lines = markdown_text.split("\n")
    n = len(lines)
    i = 0
    while i < n:
        line = lines[i]
        # A table starts with a header row directly followed by a separator.
        if (
            "|" not in line
            or i + 1 >= n
            or not re.match(r"^\|[\s:-]+\|", lines[i + 1])
        ):
            i += 1
            continue

        first_cell = line.strip().lower().strip("|").split("|")[0].strip()
        if not any(kw in first_cell for kw in gallery_keywords):
            i += 1
            continue

        # Skip header + separator, then consume the data rows.
        i += 2
        while i < n and "|" in lines[i]:
            cells = [c.strip() for c in lines[i].strip().strip("|").split("|")]
            i += 1
            if len(cells) < 2:
                continue
            url_match = re.search(r"!\[([^\]]*)\]\(([^)]+)\)", cells[0])
            if not url_match:
                continue
            # ``![alt](path "title")`` — keep only the destination part.
            destination = url_match.group(2).split()
            if not destination:
                continue
            raw_path = destination[0]
            if raw_path.lower().startswith(("http://", "https://")):
                url = raw_path
            else:
                # Drop leading ``./``, ``../`` and ``/`` segments only.
                # ``str.lstrip("./")`` would strip a *character set* and
                # mangle dot-prefixed names such as ``.cache/img.png``.
                clean = re.sub(r"^(?:\.{1,2}/|/)+", "", raw_path)
                url = f"{base_url}/{clean}"
            if url in seen_urls:
                continue
            seen_urls.add(url)
            prompt = cells[1]
            images.append({
                "url": url,
                "type": "image",
                "nsfwLevel": 0,
                "width": default_width,
                "height": default_height,
                "meta": {"prompt": prompt, "negativePrompt": ""},
                "hasMeta": bool(prompt),
                "hasPositivePrompt": bool(prompt),
            })
    return images
 def _extract_frontmatter(text: str) -> str:
    """Return the YAML frontmatter content (without the ``---`` delimiters).
@@ -145,7 +232,260 @@ def convert_readme_to_html(markdown_text: str | None) -> str:
 # ---------------------------------------------------------------------------
-# Pre-processing: strip unwanted sections
+# README cleaning for LLM prompt injection
 # ---------------------------------------------------------------------------
 #: Lower-case heading titles treated as boilerplate by
 #: ``_strip_boilerplate_sections``.  A heading matches when its lower-cased
 #: title equals an entry or starts with the entry followed by a space or colon.
 _BOILERPLATE_HEADERS: tuple[str, ...] = (
    "download model",
    "license",
    "citation",
    "links",
    "disclaimer",
    "architecture notes",
    "training details",
    "dataset",
    "provenance",
 )
 #: Lower-case substrings that mark a table as a training-parameter table.
 #: ``_strip_training_tables`` searches for them in the table's header row and
 #: in its first data row.
 _TRAINING_PARAM_KEYWORDS: tuple[str, ...] = (
    "lr scheduler",
    "optimizer",
    "network dim",
    "network alpha",
    "noise offset",
    "multires noise",
    "repeat",
    "epoch",
    "batch size",
    "gradient accumulation",
    "learning rate",
    "rslora",
    "dtype",
 )
 #: Minimum stripped length at which a line may *open* a candidate run in
 #: ``_strip_massive_lists`` even without a trailing comma.  Length alone never
 #: removes a line; the run must still reach ``_MASSIVE_LIST_THRESHOLD``.
 _MASSIVE_LIST_LINE_MIN_LEN = 150
 #: Minimum number of list-like lines (the opening line plus the enumeration
 #: lines that follow it, blank lines ignored) for a run to be stripped.
 _MASSIVE_LIST_THRESHOLD = 8
 def clean_readme_for_llm(markdown_text: str | None, max_length: int = 6000) -> str:
    """Reduce a HuggingFace README to the parts worth sending to the LLM.

    The text is run through a fixed sequence of strip helpers, coarse ones
    first:

    1. ``_strip_gallery``
    2. ``_strip_fenced_code_blocks`` — fenced blocks carrying a language tag
       (untagged blocks are kept)
    3. ``_strip_standalone_images`` — image-only lines (markdown alt text is
       kept)
    4. ``_strip_training_tables``
    5. ``_strip_boilerplate_sections``
    6. ``_strip_massive_lists``
    7. ``_strip_badge_images``
    8. ``_strip_html_comments``
    9. ``_compress_blank_lines``

    ``_strip_gallery``, ``_strip_badge_images`` and ``_strip_html_comments``
    are defined elsewhere in this module; judging by their names they remove
    ``<Gallery />`` markup, badge images and HTML comments.

    The input itself is never modified, so callers can keep the raw README
    for HTML conversion or gallery-image extraction.

    Args:
        markdown_text: Raw README.md content; ``None`` or an empty string
            yields an empty result.
        max_length: Hard ceiling, in characters, applied after cleaning and
            before the final whitespace strip (default 6 000).

    Returns:
        The cleaned markdown, cut to at most *max_length* characters and
        stripped of leading/trailing whitespace.
    """
    if not markdown_text:
        return ""

    # Order matters — broad removals run before the finer-grained ones.
    stages = (
        _strip_gallery,
        _strip_fenced_code_blocks,
        _strip_standalone_images,
        _strip_training_tables,
        _strip_boilerplate_sections,
        _strip_massive_lists,
        _strip_badge_images,
        _strip_html_comments,
        _compress_blank_lines,
    )
    cleaned = markdown_text
    for stage in stages:
        cleaned = stage(cleaned)

    # Slicing is a no-op when the text is already short enough.
    return cleaned[:max_length].strip()
 def _strip_fenced_code_blocks(text: str) -> str:
    """Strip fenced code blocks that have an explicit programming-language tag.
    Blocks without a language tag (just `` ``` ``) are preserved — they
    often contain trigger words, example prompts, or config snippets
    rather than actual runnable code.
    """
    # Match opening ``` immediately followed by a word character (the language
    # tag), then any content, then closing ```.  Plain ``` at the start of a
    # line is left intact.  A leading \n is optional (handles blocks at the
    # start of the text).
    return re.sub(
        r"(?:\n|^)```[a-zA-Z_][a-zA-Z0-9_]*\s*\n.*?\n```",
        "",
        text,
        flags=re.DOTALL,
    )
 def _strip_standalone_images(text: str) -> str:
    """Strip image embeds that occupy their own line.
    Preserves the alt text from markdown images (``![alt](url)`` → ``alt``)
    since it often describes what the model generates, which is useful signal
    for tag/description extraction.
    """
    # Markdown: ``![alt](url)`` on its own line → keep alt text
    text = re.sub(
        r"^\s*!\[([^\]]*)\]\([^)]+\)\s*$",
        r"\1",
        text,
        flags=re.MULTILINE,
    )
    # HTML: ``<img src="..." ...>`` on its own line → remove entirely
    text = re.sub(
        r'^\s*<img\s[^>]+/?>(?:</img>)?\s*$',
        "",
        text,
        flags=re.MULTILINE | re.IGNORECASE,
    )
    return text
 def _strip_training_tables(text: str) -> str:
    """Strip markdown tables whose header row mentions training parameters.
    Checks the header row (first line of a detected table) against
    ``_TRAINING_PARAM_KEYWORDS``.  Non-training tables (e.g. "Best
    Dimensions") are preserved.
    """
    lines = text.split("\n")
    out: list[str] = []
    i = 0
    n = len(lines)
    while i < n:
        line = lines[i]
        if "|" in line and i + 1 < n and re.match(r"^\|[\s:-]+\|", lines[i + 1]):
            table_lines = [line]
            i += 1
            while i < n and "|" in lines[i]:
                table_lines.append(lines[i])
                i += 1
            # Check header + first data row for training keywords
            header_and_first = (line + "\n" + (table_lines[2] if len(table_lines) > 2 else "")).lower()
            if any(kw in header_and_first for kw in _TRAINING_PARAM_KEYWORDS):
                continue
            out.extend(table_lines)
        else:
            out.append(line)
            i += 1
    return "\n".join(out)
 def _strip_boilerplate_sections(text: str) -> str:
    """Strip sections whose headings match known boilerplate patterns.
    When a heading (``## Download model``, ``## License``, etc.) is
    detected, the heading and all content until the next heading of
    equal-or-higher level is removed.
    """
    lines = text.split("\n")
    out: list[str] = []
    i = 0
    n = len(lines)
    skip_until_level: int | None = None
    while i < n:
        line = lines[i]
        h_match = re.match(r"^(#{1,4})\s+(.+?)\s*#*$", line)
        if h_match:
            level = len(h_match.group(1))
            title = h_match.group(2).strip().lower()
            is_boilerplate = any(
                title == kw or title.startswith(kw + " ") or title.startswith(kw + ":")
                for kw in _BOILERPLATE_HEADERS
            )
            if is_boilerplate:
                skip_until_level = level
                i += 1
                continue
            if skip_until_level is not None and level <= skip_until_level:
                skip_until_level = None
        if skip_until_level is None:
            out.append(line)
        i += 1
    return "\n".join(out)
 def _strip_massive_lists(text: str) -> str:
    """Strip blocks of 8+ consecutive enumeration-style lines.
    Targets long comma-separated name lists (e.g. the 3000+ celebrity
    names in some Z-Image READMEs) and dense bullet enumerations.
    """
    lines = text.split("\n")
    out: list[str] = []
    i = 0
    n = len(lines)
    while i < n:
        stripped = lines[i].strip()
        # A "list-like" line ends with comma or is a bullet with commas
        is_list_like = bool(stripped) and (
            stripped.endswith(",")
            or len(stripped) >= _MASSIVE_LIST_LINE_MIN_LEN
            or (bool(re.match(r"^[-*+]\s", stripped)) and "," in stripped)
        )
        if is_list_like:
            count = 1
            j = i + 1
            while j < n:
                s = lines[j].strip()
                if not s:
                    j += 1
                    continue
                if s.endswith(",") or (bool(re.match(r"^[-*+]\s", s)) and "," in s):
                    count += 1
                    j += 1
                else:
                    break
            if count >= _MASSIVE_LIST_THRESHOLD:
                i = j
                continue
        out.append(lines[i])
        i += 1
    return "\n".join(out)
 def _compress_blank_lines(text: str) -> str:
    """Collapse runs of 3+ blank lines down to 2."""
    return re.sub(r"\n{3,}", "\n\n", text)
 # ---------------------------------------------------------------------------
 # Pre-processing: strip unwanted sections (HTML conversion helpers)
 # ---------------------------------------------------------------------------
--- a/py/services/llm_service.py
+++ b/py/services/llm_service.py
@@ -333,18 +333,53 @@ class LLMService:
        cfg = self._ensure_configured()
        api_base = self._resolve_api_base(cfg["provider"], cfg["api_base"])
        url = f"{api_base}/chat/completions"
        model_name = model or cfg["model"]
-        payload: Dict[str, Any] = {
+        is_ollama = cfg["provider"] == "ollama"
-            "model": model_name,
+
-            "messages": messages,
+        if is_ollama:
-            "temperature": temperature,
+            # Use Ollama's native /api/chat endpoint which does NOT expose
-        }
+            # a separate reasoning/thinking field (the model's full output
-        if response_format is not None:
+            # lands directly in message.content).  The OpenAI-compatible
-            payload["response_format"] = response_format
+            # endpoint splits thinking into the "reasoning" field, making
-        if max_tokens is not None:
+            # content empty when thinking consumes all available tokens.
-            payload["max_tokens"] = max_tokens
+            base = api_base.rstrip("/")
            if base.endswith("/v1"):
                base = base[:-3]
            url = f"{base}/api/chat"
        else:
            url = f"{api_base}/chat/completions"
        payload: Dict[str, Any]
        if is_ollama:
            payload = {
                "model": model_name,
                "messages": messages,
                "stream": False,
                # Suppress separate thinking trace — thinking still happens
                # internally (accuracy preserved) but output goes directly to
                # message.content instead of being split across content +
                # thinking.  Without this the model can exhaust num_predict
                # on thinking alone and leave content empty.
                "think": False,
                "options": {
                    "temperature": temperature,
                },
            }
            if response_format is not None:
                payload["format"] = "json"
            if max_tokens is not None:
                payload["options"]["num_predict"] = max_tokens
        else:
            payload = {
                "model": model_name,
                "messages": messages,
                "temperature": temperature,
            }
            if response_format is not None:
                payload["response_format"] = response_format
            if max_tokens is not None:
                payload["max_tokens"] = max_tokens
        headers = self._build_headers(cfg["api_key"])
@@ -387,8 +422,25 @@ class LLMService:
            # Parse response
            try:
-                content = data["choices"][0]["message"]["content"]
+                if is_ollama:
-                usage = data.get("usage", {})
+                    content = (data.get("message") or {}).get("content") or ""
                    usage = {"completion_tokens": data.get("eval_count", 0)}
                    finish_reason = data.get("done_reason", "")
                    if not content:
                        logger.warning(
                            "LLM returned empty content. Provider=ollama, "
                            "done_reason=%s, eval_count=%s",
                            finish_reason,
                            data.get("eval_count", 0),
                        )
                else:
                    content = data["choices"][0]["message"].get("content") or ""
                    usage = data.get("usage", {})
                    if not content:
                        logger.warning(
                            "LLM returned empty content. Full response truncated: %s",
                            json.dumps(data, ensure_ascii=False)[:1000],
                        )
                return {
                    "content": content,
                    "usage": usage,
@@ -442,13 +494,16 @@ class LLMService:
            {"role": "user", "content": user_prompt},
        ]
-        # First attempt with JSON mode
+        # First attempt with JSON mode.
        # Use a generous max_tokens so thinking-enabled models (e.g.
        # gemma4 via Ollama) have room to reason AND still emit content.
        effective_max = max_tokens or 131072
        result = await self.chat_completion(
            messages=messages,
            model=model,
            temperature=temperature,
            response_format={"type": "json_object"},
-            max_tokens=max_tokens,
+            max_tokens=effective_max,
        )
        try:
@@ -458,11 +513,15 @@ class LLMService:
                "LLM JSON parse failed on first attempt: %s. Retrying.", exc
            )
-        # Retry with explicit instruction to return valid JSON
+        # Retry WITHOUT response_format — some providers (Ollama with
        # thinking-enabled models like gemma4) may return empty content
        # when json_object mode is active.  Fall back to a textual
        # instruction instead.
        previous_content = result.get("content", "") or ""
        retry_messages = messages + [
            {
                "role": "assistant",
-                "content": result["content"],
+                "content": previous_content or "(empty response)",
            },
            {
                "role": "user",
@@ -478,14 +537,21 @@ class LLMService:
            messages=retry_messages,
            model=model,
            temperature=0.0,  # More deterministic for retry
-            response_format={"type": "json_object"},
+            max_tokens=effective_max,
            max_tokens=max_tokens,
        )
-        try:
+        content = result.get("content", "") or ""
-            return json.loads(result["content"])
+        if not content:
        except (json.JSONDecodeError, TypeError) as exc:
            raise LLMResponseError(
-                f"LLM response could not be parsed as JSON after retry: {exc}\n"
+                "LLM response could not be parsed as JSON after retry: "
-                f"Raw content: {result['content'][:500]}"
+                f"Expecting value: line 1 column 1 (char 0)\n"
-            ) from exc
+                f"Raw content: {content[:500]}"
            )
        try:
            return json.loads(content)
        except (json.JSONDecodeError, TypeError) as parse_err:
            raise LLMResponseError(
                f"LLM response could not be parsed as JSON after retry: {parse_err}\n"
                f"Raw content: {content[:500]}"
            ) from parse_err
--- a/tests/agent_cli/test_agent_cli.py
+++ b/tests/agent_cli/test_agent_cli.py
@@ -583,3 +583,443 @@ widget:
        assert len(images) == 1
        assert "two samurais doing a muay thai fight" in images[0]["meta"]["prompt"]
        assert "Textured abstract style" in images[0]["meta"]["prompt"]
 # ======================================================================
 # extract_gallery_table_images  —  Sample Gallery markdown tables
 # ======================================================================
 class TestExtractGalleryTableImages:
    """Tests for ``extract_gallery_table_images`` on markdown image tables."""
    # Repository id used as the default ``repo`` argument of ``_extract``.
    _REPO = "Limbicnation/pixel-art-lora"
    # Fixture: a "Sample Gallery" section whose table has two image/prompt
    # rows that reference repo-relative image paths.
    _README = """## Sample Gallery
 | Preview | Prompt |
 |---------|--------|
 | ![Knight](./samples/knight.png) | pixel art sprite, a brave knight |
 | ![Dragon](./samples/dragon.png) | pixel art sprite, a fire dragon |
 """
    @staticmethod
    def _extract(md: str, repo: str = _REPO, existing: set | None = None):
        """Call ``extract_gallery_table_images`` and return its result.

        *existing* is forwarded as the ``existing_urls`` keyword argument.
        The function under test is imported at call time rather than at
        module import.
        """
        from py.services.agent.skills.enrich_hf_metadata.md_to_html import \
            extract_gallery_table_images
        return extract_gallery_table_images(md, repo, existing_urls=existing)
    def test_extracts_table_images(self):
        """Both fixture rows yield an image, in table order, and the first
        row's prompt cell becomes ``meta["prompt"]``."""
        images = self._extract(self._README)
        assert len(images) == 2
        assert "knight.png" in images[0]["url"]
        assert images[0]["meta"]["prompt"] == "pixel art sprite, a brave knight"
        assert "dragon.png" in images[1]["url"]
    def test_skips_existing_urls(self):
        """A row whose image URL is already in ``existing_urls`` is left out."""
        # Expected resolved URL of the knight row for the default repo.
        existing = {"https://huggingface.co/Limbicnation/pixel-art-lora/resolve/main/samples/knight.png"}
        images = self._extract(self._README, existing=existing)
        assert len(images) == 1
        assert "knight.png" not in images[0]["url"]
    def test_empty_readme_returns_empty(self):
        """An empty README produces an empty list."""
        assert self._extract("") == []
    def test_no_gallery_table_returns_empty(self):
        """A README without any table produces an empty list."""
        md = "## Description\nSome text."
        assert self._extract(md) == []
    def test_non_gallery_table_skipped(self):
        """A table that contains no images produces an empty list."""
        md = "| Param | Value |\n|---|---|\n| Steps | 4 |"
        assert self._extract(md) == []
    def test_absolute_url_preserved(self):
        """An image that already has an absolute URL is returned unchanged."""
        md = "| Preview | Prompt |\n|---|---|\n| ![img](https://cdn.example.com/img.png) | text |"
        images = self._extract(md, repo="user/repo")
        assert len(images) == 1
        assert images[0]["url"] == "https://cdn.example.com/img.png"
 # ======================================================================
 # clean_readme_for_llm  —  pre-process README before LLM injection
 # ======================================================================
 class TestCleanReadmeForLlm:
    @staticmethod
    def _clean(md: str, max_length: int = 6000) -> str:
        """Run ``clean_readme_for_llm`` on *md* and return the cleaned text.

        *max_length* is forwarded as a keyword argument.  The function under
        test is imported at call time rather than at module import.
        """
        from py.services.agent.skills.enrich_hf_metadata.md_to_html import (
            clean_readme_for_llm as cleaner,
        )

        return cleaner(md, max_length=max_length)
    # -- basic guards --------------------------------------------------------
    def test_none_returns_empty(self):
        """``None`` input is accepted and cleans to an empty string."""
        cleaned = self._clean(None)  # type: ignore[arg-type]
        assert cleaned == ""
    def test_empty_returns_empty(self):
        """An empty README cleans to an empty string."""
        cleaned = self._clean("")
        assert cleaned == ""
    def test_plain_text_passes_through(self):
        """Prose without any markup is still present after cleaning."""
        sentence = "Just some description text."
        assert sentence in self._clean(sentence)
    # -- widget section handling (widget text is preserved) ------------------
    def test_widget_text_preserved_in_cleaned_output(self):
        """Widget section text is preserved — it provides useful signal
        for tag and description extraction (example prompts describe what
        the model generates)."""
        # Fixture: frontmatter with two widget entries (one quoted string,
        # one ``>-`` folded scalar) placed between other frontmatter keys.
        md = """---
 tags:
 - lora
 - anime
 widget:
 - text: "a test prompt"
   output:
     url: images/test.png
 - text: >-
     another long
     prompt here
   output:
     url: images/test2.png
 base_model: black-forest-labs/FLUX.1-dev
 instance_prompt: trigger word
 ---
 # Model Description
 This is the actual content.
 """
        result = self._clean(md)
        # Widget text content preserved (valuable signal for tags)
        # YAML folded scalars (``>-``) may split text across lines
        assert "a test prompt" in result
        assert "another long" in result
        assert "prompt here" in result
        # Non-widget frontmatter preserved
        assert "base_model: black-forest-labs/FLUX.1-dev" in result
        assert "instance_prompt: trigger word" in result
        assert "tags:" in result
        assert "- lora" in result
        assert "- anime" in result
        # Body heading after the frontmatter is preserved as well
        assert "Model Description" in result
    def test_widget_last_key_in_frontmatter(self):
        """Widget text at end of frontmatter is preserved."""
        # Fixture: ``widget`` is the final key, directly before the closing
        # ``---``, and its ``text`` field follows ``output`` within the entry.
        md = """---
 tags:
 - lora
 widget:
 - output:
     url: img.png
   text: prompt
 ---
 # Content
 """
        result = self._clean(md)
        assert "prompt" in result
        assert "tags:" in result
    def test_no_widget_untouched(self):
        """Frontmatter without a ``widget`` key keeps its ``tags`` and
        ``base_model`` entries."""
        md = """---
 tags:
 - lora
 base_model: flux
 ---
 # Content
 """
        result = self._clean(md)
        assert "tags:" in result
        assert "base_model: flux" in result
    # -- gallery stripping ---------------------------------------------------
    def test_gallery_tag_stripped(self):
        """A ``<Gallery />`` tag on its own line does not survive cleaning."""
        source = "\n".join(["Some text", "<Gallery />", "more text"])
        assert "<Gallery" not in self._clean(source)
    # -- code block stripping ------------------------------------------------
    def test_fenced_code_block_stripped(self):
        """A fenced block tagged ``python`` is removed while the headings
        around it are kept."""
        md = """## Usage
 ```python
 import torch
 pipe = DiffusionPipeline.from_pretrained('base')
 ```
 ## Description
 Some text.
 """
        result = self._clean(md)
        # Code inside the fence is gone
        assert "import torch" not in result
        assert "DiffusionPipeline" not in result
        # Headings before and after the fence remain
        assert "## Usage" in result
        assert "## Description" in result
    def test_bash_code_block_stripped(self):
        """A fenced block tagged ``bash`` is removed; its heading is kept."""
        md = """## Setup
 ```bash
 pip install diffusers
 huggingface-cli download repo
 ```
 """
        result = self._clean(md)
        assert "pip install" not in result
        assert "## Setup" in result
    def test_code_block_sections_remain_separated(self):
        """Removing a ``bash`` block leaves both neighbouring sections intact."""
        source = "\n".join(
            [
                "## Install",
                "```bash",
                "pip install x",
                "```",
                "",
                "## Usage",
                "Some text.",
            ]
        )
        cleaned = self._clean(source)
        assert "pip install" not in cleaned
        for kept in ("## Install", "## Usage", "Some text."):
            assert kept in cleaned
    def test_unmarked_code_block_preserved(self):
        """Unmarked fenced code blocks (just ```) are kept since they
        often contain trigger words rather than code."""
        # Fixture: a fence with no language tag that holds a trigger-word list.
        md = """### Trigger Words
 Always include:
 ```
 pixel art sprite, game asset, transparent background
 ```
 """
        result = self._clean(md)
        assert "pixel art sprite" in result
        assert "game asset" in result
        assert "transparent background" in result
    def test_unmarked_code_block_with_python_preserved(self):
        """Code inside a fence without a language tag is kept.

        This is an accepted false positive: trigger-word blocks are
        unmarked too, so unmarked fences are never removed.
        """
        source = "\n".join(
            [
                "## Setup",
                "```",
                "import torch",
                "print('hello')",
                "```",
                "## Desc",
                "Text.",
            ]
        )
        assert "import torch" in self._clean(source)
    # -- standalone image stripping ------------------------------------------
    def test_standalone_image_stripped(self):
        """Standalone markdown images lose their URL but keep their alt text."""
        source = "\n".join(
            [
                "## Gallery",
                "![sample](https://cdn.hf.co/img.png)",
                "![another](https://cdn.hf.co/img2.png)",
                "",
                "Some text.",
            ]
        )
        cleaned = self._clean(source)
        assert "cdn.hf.co" not in cleaned
        # Alt texts, the heading and the trailing paragraph all remain.
        for kept in ("sample", "another", "## Gallery", "Some text."):
            assert kept in cleaned
    def test_html_img_tag_stripped(self):
        """The URL of an HTML ``<img>`` tag is removed; later prose is kept."""
        source = "\n".join(
            [
                "## Preview",
                '<img src="https://cdn.hf.co/img.webp"></img>',
                "",
                "Description.",
            ]
        )
        cleaned = self._clean(source)
        assert "cdn.hf.co" not in cleaned
        assert "Description." in cleaned
    def test_inline_image_within_paragraph_preserved(self):
        """The text on both sides of an inline image stays in the output."""
        source = "Click here ![icon](https://example.com/icon.png) for more info."
        cleaned = self._clean(source)
        for fragment in ("Click here", "for more info"):
            assert fragment in cleaned
    # -- training table stripping --------------------------------------------
    def test_training_table_stripped(self):
        """A table of training hyperparameters is removed, while a later
        resolution table in the same README is kept."""
        md = """## Training
 | Parameter     | Value    |
 |---------------|----------|
 | LR Scheduler  | constant |
 | Optimizer     | AdamW    |
 | Network Dim   | 64       |
 ## Best Dimensions
 | Resolution | Status  |
 |-----------|---------|
 | 768x1024  | Best    |
 """
        result = self._clean(md)
        # Hyperparameter rows are gone
        assert "LR Scheduler" not in result
        assert "Optimizer" not in result
        assert "Network Dim" not in result
        # Normal table preserved
        assert "Best Dimensions" in result
        assert "768x1024" in result
    def test_normal_table_preserved(self):
        """A table that lists resolutions is kept in the cleaned output."""
        md = """## Recommended
 | Resolution | Status  |
 |-----------|---------|
 | 1024x1024 | Default |
 """
        result = self._clean(md)
        assert "1024x1024" in result
    # -- boilerplate section stripping ---------------------------------------
    def test_boilerplate_license_stripped(self):
        """A ``## License`` section (heading and body) is removed, and the
        sections before and after it are kept."""
        md = """## Description
 Some text.
 ## License
 apache-2.0
 Some license details here.
 ## More Content
 After license.
 """
        result = self._clean(md)
        # License heading and body are gone
        assert "apache-2.0" not in result
        assert "## License" not in result
        # Neighbouring sections survive, including the one after the license
        assert "## Description" in result
        assert "## More Content" in result
        assert "After license." in result
    def test_boilerplate_disclaimer_stripped(self):
        """The bodies of ``## DISCLAIMER`` and ``## Citation`` sections are
        removed, while the preceding description text is kept."""
        md = """## Description
 Some text.
 ## DISCLAIMER
 Legal text here.
 ## Citation
 Bibtex here.
 """
        result = self._clean(md)
        assert "Legal text" not in result
        assert "Bibtex" not in result
        assert "Some text." in result
    def test_boilerplate_subsection_not_stripped(self):
        """Only top-level (##) boilerplate is stripped; ### subsections inside
        non-boilerplate headings are left alone."""
        # Fixture: a ``###`` subsection nested under a ``## Usage`` heading.
        md = """## Usage
 Some text.
 ### Important Note
 This is a note within the usage section.
 """
        result = self._clean(md)
        assert "Important Note" in result
    # -- massive list stripping ----------------------------------------------
    def test_massive_name_list_stripped(self):
        """Twelve consecutive comma-separated name lines are removed, while
        the heading above them is kept."""
        name_rows = [
            f"Name{i}A, Name{i}B, Name{i}C, Name{i}D, Name{i}E,"
            for i in range(12)
        ]
        source = "\n".join(
            ["## 2026 Updates:", *name_rows, "## License", "apache"]
        )
        cleaned = self._clean(source)
        # First and last generated names are both gone.
        assert "Name0A" not in cleaned
        assert "Name11E" not in cleaned
        assert "## 2026 Updates:" in cleaned
        # License stripped by boilerplate
        assert "apache" not in cleaned
    def test_short_list_preserved(self):
        """A run of only four comma-separated lines (fewer than 8) is kept."""
        tag_rows = [f"tag{i}A, tag{i}B," for i in range(4)]
        source = "\n".join(
            ["## Tags:", *tag_rows, "## Description", "Some text."]
        )
        cleaned = self._clean(source)
        assert "tag0A" in cleaned
        assert "tag3B" in cleaned
    # -- max_length truncation -----------------------------------------------
    def test_truncation(self):
        """Output is capped at ``max_length`` and keeps the start of the input."""
        first_line = "A" * 100
        source = "\n".join((first_line, "B" * 100))
        cleaned = self._clean(source, max_length=150)
        assert len(cleaned) <= 150
        assert cleaned.startswith(first_line)
    # -- integration: end-to-end realistic README ----------------------------
    def test_realistic_flux_lora_readme(self):
        md = """---
 tags:
 - text-to-image
 - lora
 - diffusers
 - 3D
 - Toon
 widget:
 - text: >-
    Long toons, a close-up of a cartoon character face...
  output:
    url: images/LT4.png
 - text: >-
    Long toons, Super Detail, a close-up shot...
  output:
    url: images/LT5.png
 base_model: black-forest-labs/FLUX.1-dev
 instance_prompt: Long toons
 license: creativeml-openrail-m
 ---
 # Flux-Long-Toon-LoRA
 <Gallery />
 **The model is still in the training phase.**
 ## Model description
 **prithivMLmods/Flux-Long-Toon-LoRA**
 Image Processing Parameters
 | Parameter                 | Value  | Parameter                 | Value  |
 |---------------------------|--------|---------------------------|--------|
 | LR Scheduler              | constant | Noise Offset              | 0.03   |
 | Optimizer                 | AdamW  | Multires Noise Discount   | 0.1    |
 | Network Dim               | 64     | Multires Noise Iterations | 10     |
 | Network Alpha             | 32     | Repeat & Steps           | 25 & 3270 |
 | Epoch                     | 18    | Save Every N Epochs       | 1     |
 ## Best Dimensions
 - 768 x 1024 (Best)
 - 1024 x 1024 (Default)
 ## Setting Up
 ```python
 import torch
 from pipelines import DiffusionPipeline
 base_model = "black-forest-labs/FLUX.1-dev"
 pipe = DiffusionPipeline.from_pretrained(base_model, torch_dtype=torch.bfloat16)
 lora_repo = "prithivMLmods/Flux-Long-Toon-LoRA"
 trigger_word = "Long toons"
 pipe.load_lora_weights(lora_repo)
 ```
 ## Trigger words
 You should use `Long toons` to trigger the image generation.
 ## Download model
 Weights for this model are available in Safetensors format.
 """
        original_len = len(md)
        result = self._clean(md)
        # Still significantly smaller (widget text is kept but training
        # tables, code blocks, boilerplate are stripped)
        assert len(result) < original_len * 0.7, (
            f"Expected <70% of original, got {len(result)}/{original_len}"
        )
        # Signal preserved
        assert "Long toons" in result
        assert "black-forest-labs/FLUX.1-dev" in result
        assert "3D" in result
        assert "Toon" in result
        # Widget content preserved (text is valuable signal for tags/desc)
        assert "close-up of a cartoon character face" in result
        assert "Super Detail" in result
        # Noise stripped
        assert "import torch" not in result
        assert "DiffusionPipeline" not in result
        assert "LR Scheduler" not in result
        assert "<Gallery" not in result
        assert "Download model" not in result