feat(agent): optimize enrich_hf_metadata with README cleaning, Ollama native API, and expanded fields

- Add clean_readme_for_llm() to strip noise from README before LLM injection
- Keep widget section text (valuable tag signal) and unmarked code blocks (trigger words)
- Preserve standalone image alt text instead of removing entirely
- Switch Ollama to native /api/chat with think:false to fix empty content on thinking models
- Extract Sample Gallery table images and deduplicate with widget images
- Only strip code blocks with explicit language tags (e.g. bash)
- Add notes and usage_tips fields to SKILL.md output format and post-processor
- Clean up dead code, fix regex edge cases, remove double type annotation
2026-07-05 17:01:16 -03:00 · 2026-07-04 08:01:50 +08:00
parent b22f09bd1d
commit a1fd4e150b
6 changed files with 937 additions and 30 deletions
--- a/py/services/agent/agent_service.py
+++ b/py/services/agent/agent_service.py
@@ -28,6 +28,7 @@ from ..llm_service import LLMService
 from ..websocket_manager import ws_manager
 from .post_processor import PostProcessor
 from .skill_registry import SkillRegistry
+from .skills.enrich_hf_metadata.md_to_html import clean_readme_for_llm

 logger = logging.getLogger(__name__)

@@ -368,7 +369,8 @@ class AgentService:
        context["repo"] = repo or ""
        if repo:
            readme = await self._fetch_readme(repo)
-            context["readme_content"] = readme[:8000] if readme else "(README not available)"
+            cleaned = clean_readme_for_llm(readme) if readme else ""
+            context["readme_content"] = cleaned if cleaned else "(README not available)"
            context["readme_content_full"] = readme or ""

        try:
--- a/py/services/agent/post_processor.py
+++ b/py/services/agent/post_processor.py
@@ -10,6 +10,7 @@ refresh cache).  All actual I/O is delegated to :mod:`~py.agent_cli`.

 from __future__ import annotations

+import json
 import logging
 import os
 from datetime import datetime, timezone
@@ -79,6 +80,7 @@ class PostProcessor:
        from .skills.enrich_hf_metadata.md_to_html import (
            convert_readme_to_html,
            extract_gallery_images,
+            extract_gallery_table_images,
            extract_repo_from_hf_url,
        )

@@ -127,23 +129,38 @@ class PostProcessor:
            desc_civitai["description"] = short_desc
            updates["civitai"] = desc_civitai

-        # gallery images → civitai.images (from YAML frontmatter widget entries)
+        # gallery images → civitai.images (from YAML frontmatter widget entries
+        # and Sample Gallery markdown tables in the README body)
+        gallery_images: List[Dict[str, Any]] = []
        if readme_content and is_hf_model:
            hf_url = metadata.get("hf_url", "") or ""
            repo = extract_repo_from_hf_url(hf_url)
            if repo:
                rec_w = llm_output.get("recommended_width") or 0
                rec_h = llm_output.get("recommended_height") or 0
+
+                # 1. Widget images (YAML frontmatter)
                gallery = extract_gallery_images(
                    readme_content, repo,
                    default_width=rec_w, default_height=rec_h,
                )
-                if gallery:
+
+                # 2. Sample Gallery table images (markdown body), deduplicated
+                existing_urls = {img["url"] for img in gallery if img.get("url")}
+                table_images = extract_gallery_table_images(
+                    readme_content, repo,
+                    existing_urls=existing_urls,
+                    default_width=rec_w, default_height=rec_h,
+                )
+
+                all_images = gallery + table_images
+                if all_images:
+                    gallery_images = all_images
                    current_civitai = metadata.get("civitai") or {}
                    gallery_civitai = dict(current_civitai)
                    if "civitai" in updates and isinstance(updates["civitai"], dict):
                        gallery_civitai.update(updates["civitai"])
-                    gallery_civitai["images"] = gallery
+                    gallery_civitai["images"] = all_images
                    updates["civitai"] = gallery_civitai

        # tags
@@ -159,6 +176,11 @@ class PostProcessor:
        updates["llm_enriched_at"] = datetime.now(timezone.utc).isoformat()

        preview_remote_url = (llm_output.get("preview_url") or "").strip()
+        # Fallback: if the LLM couldn't find a preview image in the cleaned
+        # README, use the first extracted gallery image (YAML widget entries
+        # come first, then Sample Gallery table images).
+        if not preview_remote_url and gallery_images:
+            preview_remote_url = gallery_images[0].get("url", "")
        current_preview = metadata.get("preview_url") or ""
        if preview_remote_url and not (current_preview and os.path.exists(current_preview)):
            local_path = await download_preview(model_path, preview_remote_url)
@@ -166,6 +188,22 @@ class PostProcessor:
                preview_downloaded = True
                updates["preview_url"] = local_path

+        # notes — plain-text summary of usage info from the LLM
+        new_notes = (llm_output.get("notes") or "").strip()
+        if new_notes:
+            updates["notes"] = new_notes
+
+        # usage_tips — JSON string (e.g. {"strength_min":0.85,"strength_max":1.4})
+        raw_tips = (llm_output.get("usage_tips") or "").strip()
+        if raw_tips and raw_tips != "{}":
+            try:
+                json.loads(raw_tips)
+                updates["usage_tips"] = raw_tips
+            except (json.JSONDecodeError, TypeError):
+                logger.warning(
+                    "LLM returned invalid usage_tips JSON: %s", raw_tips[:200]
+                )
+
        if updates:
            updated_fields = await apply_metadata_updates(model_path, updates)

--- a/py/services/agent/skills/enrich_hf_metadata/SKILL.md
+++ b/py/services/agent/skills/enrich_hf_metadata/SKILL.md
@@ -84,6 +84,25 @@ The recommended image generation resolution for this model, in pixels. Look for
 ### preview_url
 The URL of the most suitable preview image from the README. Look for image tags (e.g. `![alt](url)`) and the YAML frontmatter `widget:` section (which often has `output.url` fields). Choose the first image that appears to be a generation example (not a logo or diagram). Construct the absolute URL as `https://huggingface.co/{{repo}}/resolve/main/{filename}`. If no suitable image is found, return an empty string.

+### notes
+A plain-text summary of the model card's key practical usage information. Combine trigger words, style modifiers, recommended parameters (steps, CFG, resolution, sampler), and any setup tips into a readable paragraph. Return empty string if the README has no useful usage info.
+
+### usage_tips
+A JSON string with structured usage recommendations. Extract from the README any explicit ranges or recommended values (e.g. "Set LoRA strength: **0.85 - 1.4**", "CLIP strength: 0.5"). Possible fields (include only those you can determine):
+
+```json
+{
+  "strength_min": 0.85,
+  "strength_max": 1.4,
+  "strength_range": "0.85-1.4",
+  "strength": 0.6,
+  "clip_strength": 0.5,
+  "clip_skip": 2
+}
+```
+
+Return the JSON string (e.g. `'{"strength_min":0.85,"strength_max":1.4}'`). Return `"{}"` if nothing useful is found.
+
 ### confidence
 Your confidence level in the extracted data:
 - "high" — most fields were explicitly stated in the README
@@ -104,6 +123,8 @@ Return ONLY a JSON object with exactly these fields (no markdown fences, no extr
  "recommended_width": 768,
  "recommended_height": 1024,
  "preview_url": "<image URL or empty string>",
+  "notes": "<plain-text usage summary or empty string>",
+  "usage_tips": "<JSON string like '{\"strength_min\":0.85,\"strength_max\":1.4}' or '{}'>",
  "confidence": "<high|medium|low>"
 }
 ```
--- a/py/services/agent/skills/enrich_hf_metadata/md_to_html.py
+++ b/py/services/agent/skills/enrich_hf_metadata/md_to_html.py
@@ -1,8 +1,13 @@
-"""Inline markdown-to-HTML converter for HF README content.
+"""Inline markdown-to-HTML converter and LLM-prompt cleaner for HF README content.

 No external dependencies.  Strips YAML frontmatter, ``<Gallery />`` sections,
 badge images, and HTML comments before rendering.  Only used by the
 ``enrich_hf_metadata`` skill.
+
+Also provides :func:`clean_readme_for_llm`, which pre-processes the raw README
+before it is injected into the LLM prompt, removing content that has little
+value for metadata extraction (language-tagged code blocks, training tables,
+boilerplate sections, massive lists, etc.).
 """

 from __future__ import annotations
@@ -118,6 +123,88 @@ def extract_gallery_images(
    return images


+def extract_gallery_table_images(
+    markdown_text: str,
+    repo: str,
+    existing_urls: set | None = None,
+    default_width: int = 512,
+    default_height: int = 512,
+) -> list[dict]:
+    """Extract images from ``| Preview | Prompt |`` markdown gallery tables.
+
+    Many HF READMEs include a sample-gallery table in the body (outside
+    the YAML frontmatter) that shows generation examples with their
+    prompts.  This function parses those tables; the caller appends the
+    results to the widget-sourced images from :func:`extract_gallery_images`.
+
+    Returns a list of dicts in the same ``civitai.images`` format as
+    :func:`extract_gallery_images`.  Already-seen URLs (from *existing_urls*)
+    are skipped.
+    """
+    if not markdown_text or not repo:
+        return []
+
+    base_url = f"https://huggingface.co/{repo}/resolve/main"
+    images: list[dict] = []
+    seen_urls: set = set(existing_urls) if existing_urls else set()
+    lines = markdown_text.split("\n")
+    n = len(lines)
+    i = 0
+
+    while i < n:
+        line = lines[i]
+        if "|" not in line or i + 1 >= n:
+            i += 1
+            continue
+
+        # Check for table separator row
+        if not re.match(r"^\|[\s:-]+\|", lines[i + 1]):
+            i += 1
+            continue
+
+        header_lower = line.strip().lower()
+        first_cell = header_lower.strip("|").split("|")[0].strip() if "|" in header_lower else ""
+        is_gallery = any(kw in first_cell for kw in ("preview", "sample", "gallery", "image", "thumbnail"))
+        if not is_gallery:
+            i += 1
+            continue
+
+        # Skip header + separator
+        i += 2
+        while i < n and "|" in lines[i]:
+            cells = [c.strip() for c in lines[i].strip().strip("|").split("|")]
+            if len(cells) >= 2:
+                first = cells[0]
+                prompt = cells[1]
+
+                url_match = re.search(r"!\[([^\]]*)\]\(([^)]+)\)", first)
+                if url_match:
+                    raw_path = url_match.group(2)
+                    if raw_path.startswith("http"):
+                        url = raw_path
+                    else:
+                        # Normalise: lstrip drops ALL leading "." and "/" chars (also "../", ".dir/")
+                        clean = raw_path.lstrip("./").lstrip("/")
+                        url = f"{base_url}/{clean}"
+
+                    if url not in seen_urls:
+                        seen_urls.add(url)
+                        images.append({
+                            "url": url,
+                            "type": "image",
+                            "nsfwLevel": 0,
+                            "width": default_width,
+                            "height": default_height,
+                            "meta": {"prompt": prompt, "negativePrompt": ""},
+                            "hasMeta": bool(prompt),
+                            "hasPositivePrompt": bool(prompt),
+                        })
+            i += 1
+        continue
+
+    return images
+
+
 def _extract_frontmatter(text: str) -> str:
    """Return the YAML frontmatter content (without the ``---`` delimiters).

@@ -145,7 +232,260 @@ def convert_readme_to_html(markdown_text: str | None) -> str:


 # ---------------------------------------------------------------------------
-# Pre-processing: strip unwanted sections
+# README cleaning for LLM prompt injection
+# ---------------------------------------------------------------------------
+
+#: Section headers that signal boilerplate content with zero metadata value.
+_BOILERPLATE_HEADERS: tuple[str, ...] = (
+    "download model",
+    "license",
+    "citation",
+    "links",
+    "disclaimer",
+    "architecture notes",
+    "training details",
+    "dataset",
+    "provenance",
+)
+
+#: Table header keywords that identify training-parameter tables.
+_TRAINING_PARAM_KEYWORDS: tuple[str, ...] = (
+    "lr scheduler",
+    "optimizer",
+    "network dim",
+    "network alpha",
+    "noise offset",
+    "multires noise",
+    "repeat",
+    "epoch",
+    "batch size",
+    "gradient accumulation",
+    "learning rate",
+    "rslora",
+    "dtype",
+)
+
+#: Minimum length (chars) at which a single line is treated as list-like.
+_MASSIVE_LIST_LINE_MIN_LEN = 150
+#: Minimum consecutive enumeration lines to trigger massive-list stripping.
+_MASSIVE_LIST_THRESHOLD = 8
+
+
+def clean_readme_for_llm(markdown_text: str | None, max_length: int = 6000) -> str:
+    """Clean a HF README for injection into an LLM metadata-extraction prompt.
+
+    Removes content that carries no signal for inferring base model,
+    trigger words, short description, tags, or a preview image URL:
+
+    * ``<Gallery />`` tags and wrappers
+    * Fenced code blocks with a language tag (Python / bash / bibtex / yaml);
+      untagged blocks and the ``widget:`` YAML text are deliberately kept
+    * Standalone ``<img>`` lines (``![alt](url)`` lines shrink to their alt text)
+    * Training-parameter tables
+    * Boilerplate sections (Download / License / Citation / …)
+    * Massive enumeration lists (e.g. 3000+ celebrity names)
+
+    The post-processor still receives the **full** raw README via
+    ``readme_content_full``, so nothing is lost for HTML conversion or
+    gallery-image extraction.
+
+    Args:
+        markdown_text: Raw README.md content from HuggingFace.
+        max_length: Hard ceiling on output length (default 6 000 chars).
+
+    Returns:
+        Cleaned markdown, truncated to *max_length*.
+    """
+    if not markdown_text:
+        return ""
+
+    text = markdown_text
+
+    # Order matters — broader strips first, then finer ones.
+    text = _strip_gallery(text)
+    text = _strip_fenced_code_blocks(text)
+    text = _strip_standalone_images(text)
+    text = _strip_training_tables(text)
+    text = _strip_boilerplate_sections(text)
+    text = _strip_massive_lists(text)
+    text = _strip_badge_images(text)
+    text = _strip_html_comments(text)
+    text = _compress_blank_lines(text)
+
+    if len(text) > max_length:
+        text = text[:max_length]
+
+    return text.strip()
+
+
+def _strip_fenced_code_blocks(text: str) -> str:
+    """Strip fenced code blocks that have an explicit programming-language tag.
+
+    Blocks without a language tag (just `` ``` ``) are preserved — they
+    often contain trigger words, example prompts, or config snippets
+    rather than actual runnable code.
+    """
+    # Match opening ``` immediately followed by a word character (the language
+    # tag), then any content, then closing ```.  Plain ``` at the start of a
+    # line is left intact.  A leading \n is optional (handles blocks at the
+    # start of the text).
+    return re.sub(
+        r"(?:\n|^)```[a-zA-Z_][a-zA-Z0-9_]*\s*\n.*?\n```",
+        "",
+        text,
+        flags=re.DOTALL,
+    )
+
+
+def _strip_standalone_images(text: str) -> str:
+    """Strip image embeds that occupy their own line.
+
+    Preserves the alt text from markdown images (``![alt](url)`` → ``alt``)
+    since it often describes what the model generates, which is useful signal
+    for tag/description extraction.
+    """
+    # Markdown: ``![alt](url)`` on its own line → keep alt text
+    text = re.sub(
+        r"^\s*!\[([^\]]*)\]\([^)]+\)\s*$",
+        r"\1",
+        text,
+        flags=re.MULTILINE,
+    )
+    # HTML: ``<img src="..." ...>`` on its own line → remove entirely
+    text = re.sub(
+        r'^\s*<img\s[^>]+/?>(?:</img>)?\s*$',
+        "",
+        text,
+        flags=re.MULTILINE | re.IGNORECASE,
+    )
+    return text
+
+
+def _strip_training_tables(text: str) -> str:
+    """Strip markdown tables whose header row mentions training parameters.
+
+    Checks the header row and the first data row of a detected table
+    against ``_TRAINING_PARAM_KEYWORDS``.  Non-training tables (e.g. "Best
+    Dimensions") are preserved.
+    """
+    lines = text.split("\n")
+    out: list[str] = []
+    i = 0
+    n = len(lines)
+
+    while i < n:
+        line = lines[i]
+        if "|" in line and i + 1 < n and re.match(r"^\|[\s:-]+\|", lines[i + 1]):
+            table_lines = [line]
+            i += 1
+            while i < n and "|" in lines[i]:
+                table_lines.append(lines[i])
+                i += 1
+
+            # Check header + first data row for training keywords
+            header_and_first = (line + "\n" + (table_lines[2] if len(table_lines) > 2 else "")).lower()
+            if any(kw in header_and_first for kw in _TRAINING_PARAM_KEYWORDS):
+                continue
+            out.extend(table_lines)
+        else:
+            out.append(line)
+            i += 1
+
+    return "\n".join(out)
+
+
+def _strip_boilerplate_sections(text: str) -> str:
+    """Strip sections whose headings match known boilerplate patterns.
+
+    When a heading (``## Download model``, ``## License``, etc.) is
+    detected, the heading and all content until the next heading of
+    equal-or-higher level is removed.
+    """
+    lines = text.split("\n")
+    out: list[str] = []
+    i = 0
+    n = len(lines)
+    skip_until_level: int | None = None
+
+    while i < n:
+        line = lines[i]
+        h_match = re.match(r"^(#{1,4})\s+(.+?)\s*#*$", line)
+        if h_match:
+            level = len(h_match.group(1))
+            title = h_match.group(2).strip().lower()
+
+            is_boilerplate = any(
+                title == kw or title.startswith(kw + " ") or title.startswith(kw + ":")
+                for kw in _BOILERPLATE_HEADERS
+            )
+
+            if is_boilerplate:
+                skip_until_level = level
+                i += 1
+                continue
+
+            if skip_until_level is not None and level <= skip_until_level:
+                skip_until_level = None
+
+        if skip_until_level is None:
+            out.append(line)
+        i += 1
+
+    return "\n".join(out)
+
+
+def _strip_massive_lists(text: str) -> str:
+    """Strip blocks of 8+ consecutive enumeration-style lines.
+
+    Targets long comma-separated name lists (e.g. the 3000+ celebrity
+    names in some Z-Image READMEs) and dense bullet enumerations.
+    """
+    lines = text.split("\n")
+    out: list[str] = []
+    i = 0
+    n = len(lines)
+
+    while i < n:
+        stripped = lines[i].strip()
+
+        # A "list-like" line ends with a comma, is very long, or is a bullet with commas
+        is_list_like = bool(stripped) and (
+            stripped.endswith(",")
+            or len(stripped) >= _MASSIVE_LIST_LINE_MIN_LEN
+            or (bool(re.match(r"^[-*+]\s", stripped)) and "," in stripped)
+        )
+
+        if is_list_like:
+            count = 1
+            j = i + 1
+            while j < n:
+                s = lines[j].strip()
+                if not s:
+                    j += 1
+                    continue
+                if s.endswith(",") or (bool(re.match(r"^[-*+]\s", s)) and "," in s):
+                    count += 1
+                    j += 1
+                else:
+                    break
+
+            if count >= _MASSIVE_LIST_THRESHOLD:
+                i = j
+                continue
+
+        out.append(lines[i])
+        i += 1
+
+    return "\n".join(out)
+
+
+def _compress_blank_lines(text: str) -> str:
+    """Collapse runs of 3+ blank lines down to 2."""
+    return re.sub(r"\n{3,}", "\n\n", text)
+
+
+# ---------------------------------------------------------------------------
+# Pre-processing: strip unwanted sections (HTML conversion helpers)
 # ---------------------------------------------------------------------------


--- a/py/services/llm_service.py
+++ b/py/services/llm_service.py
@@ -333,18 +333,53 @@ class LLMService:

        cfg = self._ensure_configured()
        api_base = self._resolve_api_base(cfg["provider"], cfg["api_base"])
-        url = f"{api_base}/chat/completions"
        model_name = model or cfg["model"]

-        payload: Dict[str, Any] = {
-            "model": model_name,
-            "messages": messages,
-            "temperature": temperature,
-        }
-        if response_format is not None:
-            payload["response_format"] = response_format
-        if max_tokens is not None:
-            payload["max_tokens"] = max_tokens
+        is_ollama = cfg["provider"] == "ollama"
+
+        if is_ollama:
+            # Use Ollama's native /api/chat endpoint, which honours the
+            # ``think`` flag set in the payload below so the model's answer
+            # lands directly in message.content.  The OpenAI-compatible
+            # endpoint splits thinking into the "reasoning" field, making
+            # content empty when thinking consumes all available tokens.
+            base = api_base.rstrip("/")
+            if base.endswith("/v1"):
+                base = base[:-3]
+            url = f"{base}/api/chat"
+        else:
+            url = f"{api_base}/chat/completions"
+
+        payload: Dict[str, Any]
+        if is_ollama:
+            payload = {
+                "model": model_name,
+                "messages": messages,
+                "stream": False,
+                # Turn off the thinking phase for thinking-capable models so
+                # the answer is written straight to message.content instead
+                # of being split across content + thinking (this may trade
+                # some reasoning quality).  Without this the model can exhaust
+                # num_predict on thinking alone and leave content empty.
+                "think": False,
+                "options": {
+                    "temperature": temperature,
+                },
+            }
+            if response_format is not None:
+                payload["format"] = "json"
+            if max_tokens is not None:
+                payload["options"]["num_predict"] = max_tokens
+        else:
+            payload = {
+                "model": model_name,
+                "messages": messages,
+                "temperature": temperature,
+            }
+            if response_format is not None:
+                payload["response_format"] = response_format
+            if max_tokens is not None:
+                payload["max_tokens"] = max_tokens

        headers = self._build_headers(cfg["api_key"])

@@ -387,8 +422,25 @@ class LLMService:

            # Parse response
            try:
-                content = data["choices"][0]["message"]["content"]
-                usage = data.get("usage", {})
+                if is_ollama:
+                    content = (data.get("message") or {}).get("content") or ""
+                    usage = {"completion_tokens": data.get("eval_count", 0)}
+                    finish_reason = data.get("done_reason", "")
+                    if not content:
+                        logger.warning(
+                            "LLM returned empty content. Provider=ollama, "
+                            "done_reason=%s, eval_count=%s",
+                            finish_reason,
+                            data.get("eval_count", 0),
+                        )
+                else:
+                    content = data["choices"][0]["message"].get("content") or ""
+                    usage = data.get("usage", {})
+                    if not content:
+                        logger.warning(
+                            "LLM returned empty content. Full response truncated: %s",
+                            json.dumps(data, ensure_ascii=False)[:1000],
+                        )
                return {
                    "content": content,
                    "usage": usage,
@@ -442,13 +494,16 @@ class LLMService:
            {"role": "user", "content": user_prompt},
        ]

-        # First attempt with JSON mode
+        # First attempt with JSON mode.
+        # Use a generous max_tokens so thinking-enabled models (e.g.
+        # gemma4 via Ollama) have room to reason AND still emit content.
+        effective_max = max_tokens or 131072
        result = await self.chat_completion(
            messages=messages,
            model=model,
            temperature=temperature,
            response_format={"type": "json_object"},
-            max_tokens=max_tokens,
+            max_tokens=effective_max,
        )

        try:
@@ -458,11 +513,15 @@ class LLMService:
                "LLM JSON parse failed on first attempt: %s. Retrying.", exc
            )

-        # Retry with explicit instruction to return valid JSON
+        # Retry WITHOUT response_format — some providers (Ollama with
+        # thinking-enabled models like gemma4) may return empty content
+        # when json_object mode is active.  Fall back to a textual
+        # instruction instead.
+        previous_content = result.get("content", "") or ""
        retry_messages = messages + [
            {
                "role": "assistant",
-                "content": result["content"],
+                "content": previous_content or "(empty response)",
            },
            {
                "role": "user",
@@ -478,14 +537,21 @@ class LLMService:
            messages=retry_messages,
            model=model,
            temperature=0.0,  # More deterministic for retry
-            response_format={"type": "json_object"},
-            max_tokens=max_tokens,
+            max_tokens=effective_max,
        )

-        try:
-            return json.loads(result["content"])
-        except (json.JSONDecodeError, TypeError) as exc:
+        content = result.get("content", "") or ""
+        if not content:
            raise LLMResponseError(
-                f"LLM response could not be parsed as JSON after retry: {exc}\n"
-                f"Raw content: {result['content'][:500]}"
-            ) from exc
+                "LLM response could not be parsed as JSON after retry: "
+                f"Expecting value: line 1 column 1 (char 0)\n"
+                f"Raw content: {content[:500]}"
+            )
+
+        try:
+            return json.loads(content)
+        except (json.JSONDecodeError, TypeError) as parse_err:
+            raise LLMResponseError(
+                f"LLM response could not be parsed as JSON after retry: {parse_err}\n"
+                f"Raw content: {content[:500]}"
+            ) from parse_err
--- a/tests/agent_cli/test_agent_cli.py
+++ b/tests/agent_cli/test_agent_cli.py
@@ -583,3 +583,443 @@ widget:
        assert len(images) == 1
        assert "two samurais doing a muay thai fight" in images[0]["meta"]["prompt"]
        assert "Textured abstract style" in images[0]["meta"]["prompt"]
+
+
+# ======================================================================
+# extract_gallery_table_images  —  Sample Gallery markdown tables
+# ======================================================================
+
+
+class TestExtractGalleryTableImages:
+
+    _REPO = "Limbicnation/pixel-art-lora"
+    _README = """## Sample Gallery
+
+| Preview | Prompt |
+|---------|--------|
+| ![Knight](./samples/knight.png) | pixel art sprite, a brave knight |
+| ![Dragon](./samples/dragon.png) | pixel art sprite, a fire dragon |
+"""
+
+    @staticmethod
+    def _extract(md: str, repo: str = _REPO, existing: set | None = None):
+        from py.services.agent.skills.enrich_hf_metadata.md_to_html import \
+            extract_gallery_table_images
+        return extract_gallery_table_images(md, repo, existing_urls=existing)
+
+    def test_extracts_table_images(self):
+        images = self._extract(self._README)
+        assert len(images) == 2
+        assert "knight.png" in images[0]["url"]
+        assert images[0]["meta"]["prompt"] == "pixel art sprite, a brave knight"
+        assert "dragon.png" in images[1]["url"]
+
+    def test_skips_existing_urls(self):
+        existing = {"https://huggingface.co/Limbicnation/pixel-art-lora/resolve/main/samples/knight.png"}
+        images = self._extract(self._README, existing=existing)
+        assert len(images) == 1
+        assert "knight.png" not in images[0]["url"]
+
+    def test_empty_readme_returns_empty(self):
+        assert self._extract("") == []
+
+    def test_no_gallery_table_returns_empty(self):
+        md = "## Description\nSome text."
+        assert self._extract(md) == []
+
+    def test_non_gallery_table_skipped(self):
+        md = "| Param | Value |\n|---|---|\n| Steps | 4 |"
+        assert self._extract(md) == []
+
+    def test_absolute_url_preserved(self):
+        md = "| Preview | Prompt |\n|---|---|\n| ![img](https://cdn.example.com/img.png) | text |"
+        images = self._extract(md, repo="user/repo")
+        assert len(images) == 1
+        assert images[0]["url"] == "https://cdn.example.com/img.png"
+
+
+# ======================================================================
+# clean_readme_for_llm  —  pre-process README before LLM injection
+# ======================================================================
+
+
+class TestCleanReadmeForLlm:
+
+    @staticmethod
+    def _clean(md: str, max_length: int = 6000) -> str:
+        from py.services.agent.skills.enrich_hf_metadata.md_to_html import \
+            clean_readme_for_llm
+        return clean_readme_for_llm(md, max_length=max_length)
+
+    # -- basic guards --------------------------------------------------------
+
+    def test_none_returns_empty(self):
+        assert self._clean(None) == ""  # type: ignore[arg-type]
+
+    def test_empty_returns_empty(self):
+        assert self._clean("") == ""
+
+    def test_plain_text_passes_through(self):
+        result = self._clean("Just some description text.")
+        assert "Just some description text." in result
+
+    # -- widget section handling (text is preserved) -------------------------
+
+    def test_widget_text_preserved_in_cleaned_output(self):
+        """Widget section text is preserved — it provides useful signal
+        for tag and description extraction (example prompts describe what
+        the model generates)."""
+        md = """---
+tags:
+- lora
+- anime
+widget:
+- text: "a test prompt"
+  output:
+    url: images/test.png
+- text: >-
+    another long
+    prompt here
+  output:
+    url: images/test2.png
+base_model: black-forest-labs/FLUX.1-dev
+instance_prompt: trigger word
+---
+# Model Description
+This is the actual content.
+"""
+        result = self._clean(md)
+        # Widget text content preserved (valuable signal for tags)
+        # YAML folded scalars (``>-``) may split text across lines
+        assert "a test prompt" in result
+        assert "another long" in result
+        assert "prompt here" in result
+        # Non-widget frontmatter preserved
+        assert "base_model: black-forest-labs/FLUX.1-dev" in result
+        assert "instance_prompt: trigger word" in result
+        assert "tags:" in result
+        assert "- lora" in result
+        assert "- anime" in result
+        assert "Model Description" in result
+
+    def test_widget_last_key_in_frontmatter(self):
+        """Widget text at end of frontmatter is preserved."""
+        md = """---
+tags:
+- lora
+widget:
+- output:
+    url: img.png
+  text: prompt
+---
+# Content
+"""
+        result = self._clean(md)
+        assert "prompt" in result
+        assert "tags:" in result
+
+    def test_no_widget_untouched(self):
+        md = """---
+tags:
+- lora
+base_model: flux
+---
+# Content
+"""
+        result = self._clean(md)
+        assert "tags:" in result
+        assert "base_model: flux" in result
+
+    # -- gallery stripping ---------------------------------------------------
+
+    def test_gallery_tag_stripped(self):
+        md = "Some text\n<Gallery />\nmore text"
+        result = self._clean(md)
+        assert "<Gallery" not in result
+
+    # -- code block stripping ------------------------------------------------
+
+    def test_fenced_code_block_stripped(self):
+        md = """## Usage
+```python
+import torch
+pipe = DiffusionPipeline.from_pretrained('base')
+```
+## Description
+Some text.
+"""
+        result = self._clean(md)
+        assert "import torch" not in result
+        assert "DiffusionPipeline" not in result
+        assert "## Usage" in result
+        assert "## Description" in result
+
+    def test_bash_code_block_stripped(self):
+        md = """## Setup
+```bash
+pip install diffusers
+huggingface-cli download repo
+```
+"""
+        result = self._clean(md)
+        assert "pip install" not in result
+        assert "## Setup" in result
+
+    def test_code_block_sections_remain_separated(self):
+        md = "## Install\n```bash\npip install x\n```\n\n## Usage\nSome text."
+        result = self._clean(md)
+        assert "pip install" not in result
+        assert "## Install" in result
+        assert "## Usage" in result
+        assert "Some text." in result
+
+    def test_unmarked_code_block_preserved(self):
+        """Unmarked fenced code blocks (just ```) are kept since they
+        often contain trigger words rather than code."""
+        md = """### Trigger Words
+
+Always include:
+
+```
+pixel art sprite, game asset, transparent background
+```
+"""
+        result = self._clean(md)
+        assert "pixel art sprite" in result
+        assert "game asset" in result
+        assert "transparent background" in result
+
+    def test_unmarked_code_block_with_python_preserved(self):
+        """Even unmarked blocks with Python code are kept (false positive
+        accepted because trigger-word blocks are unmarked)."""
+        md = "## Setup\n```\nimport torch\nprint('hello')\n```\n## Desc\nText."
+        result = self._clean(md)
+        assert "import torch" in result
+
+    # -- standalone image stripping (URLs removed, alt text kept) ------------
+
+    def test_standalone_image_stripped(self):
+        md = "## Gallery\n![sample](https://cdn.hf.co/img.png)\n![another](https://cdn.hf.co/img2.png)\n\nSome text."
+        result = self._clean(md)
+        assert "cdn.hf.co" not in result
+        assert "sample" in result  # alt text preserved
+        assert "another" in result  # alt text preserved
+        assert "## Gallery" in result
+        assert "Some text." in result
+
+    def test_html_img_tag_stripped(self):
+        md = '## Preview\n<img src="https://cdn.hf.co/img.webp"></img>\n\nDescription.'
+        result = self._clean(md)
+        assert "cdn.hf.co" not in result
+        assert "Description." in result
+
+    def test_inline_image_within_paragraph_preserved(self):
+        """Inline images inside paragraphs are rare but shouldn't be stripped."""
+        md = "Click here ![icon](https://example.com/icon.png) for more info."
+        result = self._clean(md)
+        assert "Click here" in result
+        assert "for more info" in result
+
+    # -- training table stripping --------------------------------------------
+
+    def test_training_table_stripped(self):
+        md = """## Training
+| Parameter     | Value    |
+|---------------|----------|
+| LR Scheduler  | constant |
+| Optimizer     | AdamW    |
+| Network Dim   | 64       |
+## Best Dimensions
+| Resolution | Status  |
+|-----------|---------|
+| 768x1024  | Best    |
+"""
+        result = self._clean(md)
+        assert "LR Scheduler" not in result
+        assert "Optimizer" not in result
+        assert "Network Dim" not in result
+        # Normal table preserved
+        assert "Best Dimensions" in result
+        assert "768x1024" in result
+
+    def test_normal_table_preserved(self):
+        md = """## Recommended
+| Resolution | Status  |
+|-----------|---------|
+| 1024x1024 | Default |
+"""
+        result = self._clean(md)
+        assert "1024x1024" in result
+
+    # -- boilerplate section stripping ---------------------------------------
+
+    def test_boilerplate_license_stripped(self):
+        md = """## Description
+Some text.
+## License
+apache-2.0
+Some license details here.
+## More Content
+After license.
+"""
+        result = self._clean(md)
+        assert "apache-2.0" not in result
+        assert "## License" not in result
+        assert "## Description" in result
+        assert "## More Content" in result
+        assert "After license." in result
+
+    def test_boilerplate_disclaimer_stripped(self):
+        md = """## Description
+Some text.
+## DISCLAIMER
+Legal text here.
+## Citation
+Bibtex here.
+"""
+        result = self._clean(md)
+        assert "Legal text" not in result
+        assert "Bibtex" not in result
+        assert "Some text." in result
+
+    def test_boilerplate_subsection_not_stripped(self):
+        """Only top-level (##) boilerplate is stripped; ### subsections inside
+        non-boilerplate headings are left alone."""
+        md = """## Usage
+Some text.
+### Important Note
+This is a note within the usage section.
+"""
+        result = self._clean(md)
+        assert "Important Note" in result
+
+    # -- massive list stripping ----------------------------------------------
+
+    def test_massive_name_list_stripped(self):
+        lines = ["## 2026 Updates:"]
+        for i in range(12):
+            lines.append(f"Name{i}A, Name{i}B, Name{i}C, Name{i}D, Name{i}E,")
+        lines.append("## License")
+        lines.append("apache")
+        md = "\n".join(lines)
+        result = self._clean(md)
+        assert "Name0A" not in result
+        assert "Name11E" not in result
+        assert "## 2026 Updates:" in result
+        # The License section is removed by the boilerplate-stripping step
+        assert "apache" not in result
+
+    def test_short_list_preserved(self):
+        """Short lists (< 8 consecutive lines) should not be stripped."""
+        lines = ["## Tags:"]
+        for i in range(4):
+            lines.append(f"tag{i}A, tag{i}B,")
+        lines.append("## Description")
+        lines.append("Some text.")
+        md = "\n".join(lines)
+        result = self._clean(md)
+        assert "tag0A" in result
+        assert "tag3B" in result
+
+    # -- max_length truncation -----------------------------------------------
+
+    def test_truncation(self):
+        md = "A" * 100 + "\n" + "B" * 100
+        result = self._clean(md, max_length=150)
+        assert len(result) <= 150
+        assert result.startswith("A" * 100)
+
+    # -- integration: end-to-end realistic README ----------------------------
+
+    def test_realistic_flux_lora_readme(self):
+        md = """---
+tags:
+- text-to-image
+- lora
+- diffusers
+- 3D
+- Toon
+widget:
+- text: >-
+    Long toons, a close-up of a cartoon character face...
+  output:
+    url: images/LT4.png
+- text: >-
+    Long toons, Super Detail, a close-up shot...
+  output:
+    url: images/LT5.png
+base_model: black-forest-labs/FLUX.1-dev
+instance_prompt: Long toons
+license: creativeml-openrail-m
+---
+# Flux-Long-Toon-LoRA
+
+<Gallery />
+
+**The model is still in the training phase.**
+
+## Model description
+
+**prithivMLmods/Flux-Long-Toon-LoRA**
+
+Image Processing Parameters
+
+| Parameter                 | Value  | Parameter                 | Value  |
+|---------------------------|--------|---------------------------|--------|
+| LR Scheduler              | constant | Noise Offset              | 0.03   |
+| Optimizer                 | AdamW  | Multires Noise Discount   | 0.1    |
+| Network Dim               | 64     | Multires Noise Iterations | 10     |
+| Network Alpha             | 32     | Repeat & Steps           | 25 & 3270 |
+| Epoch                     | 18    | Save Every N Epochs       | 1     |
+
+## Best Dimensions
+
+- 768 x 1024 (Best)
+- 1024 x 1024 (Default)
+
+## Setting Up
+```python
+import torch
+from pipelines import DiffusionPipeline
+
+base_model = "black-forest-labs/FLUX.1-dev"
+pipe = DiffusionPipeline.from_pretrained(base_model, torch_dtype=torch.bfloat16)
+
+lora_repo = "prithivMLmods/Flux-Long-Toon-LoRA"
+trigger_word = "Long toons"
+pipe.load_lora_weights(lora_repo)
+```
+
+## Trigger words
+
+You should use `Long toons` to trigger the image generation.
+
+## Download model
+
+Weights for this model are available in Safetensors format.
+"""
+        original_len = len(md)
+        result = self._clean(md)
+
+        # Still significantly smaller (widget text is kept but training
+        # tables, code blocks, boilerplate are stripped)
+        assert len(result) < original_len * 0.7, (
+            f"Expected <70% of original, got {len(result)}/{original_len}"
+        )
+
+        # Signal preserved
+        assert "Long toons" in result
+        assert "black-forest-labs/FLUX.1-dev" in result
+        assert "3D" in result
+        assert "Toon" in result
+
+        # Widget content preserved (text is valuable signal for tags/desc)
+        assert "close-up of a cartoon character face" in result
+        assert "Super Detail" in result
+
+        # Noise stripped
+        assert "import torch" not in result
+        assert "DiffusionPipeline" not in result
+        assert "LR Scheduler" not in result
+        assert "<Gallery" not in result
+        assert "Download model" not in result