feat(agent): optimize enrich_hf_metadata with README cleaning, Ollama native API, and expanded fields

- Add clean_readme_for_llm() to strip noise from README before LLM injection
- Keep widget section text (valuable tag signal) and unmarked code blocks (trigger words)
- Preserve standalone image alt text instead of removing entirely
- Switch Ollama to native /api/chat with think:false to fix empty content on thinking models
- Extract Sample Gallery table images and deduplicate with widget images
- Only strip code blocks with explicit language tags (bash)
- Add notes and usage_tips fields to SKILL.md output format and post-processor
- Clean up dead code, fix regex edge cases, remove double type annotation
This commit is contained in:
Will Miao
2026-07-04 08:01:50 +08:00
parent b22f09bd1d
commit a1fd4e150b
6 changed files with 937 additions and 30 deletions

View File

@@ -28,6 +28,7 @@ from ..llm_service import LLMService
from ..websocket_manager import ws_manager
from .post_processor import PostProcessor
from .skill_registry import SkillRegistry
from .skills.enrich_hf_metadata.md_to_html import clean_readme_for_llm
logger = logging.getLogger(__name__)
@@ -368,7 +369,8 @@ class AgentService:
context["repo"] = repo or ""
if repo:
readme = await self._fetch_readme(repo)
context["readme_content"] = readme[:8000] if readme else "(README not available)"
cleaned = clean_readme_for_llm(readme) if readme else ""
context["readme_content"] = cleaned if cleaned else "(README not available)"
context["readme_content_full"] = readme or ""
try:

View File

@@ -10,6 +10,7 @@ refresh cache). All actual I/O is delegated to :mod:`~py.agent_cli`.
from __future__ import annotations
import json
import logging
import os
from datetime import datetime, timezone
@@ -79,6 +80,7 @@ class PostProcessor:
from .skills.enrich_hf_metadata.md_to_html import (
convert_readme_to_html,
extract_gallery_images,
extract_gallery_table_images,
extract_repo_from_hf_url,
)
@@ -127,23 +129,38 @@ class PostProcessor:
desc_civitai["description"] = short_desc
updates["civitai"] = desc_civitai
# gallery images → civitai.images (from YAML frontmatter widget entries)
# gallery images → civitai.images (from YAML frontmatter widget entries
# and Sample Gallery markdown tables in the README body)
gallery_images: List[Dict[str, Any]] = []
if readme_content and is_hf_model:
hf_url = metadata.get("hf_url", "") or ""
repo = extract_repo_from_hf_url(hf_url)
if repo:
rec_w = llm_output.get("recommended_width") or 0
rec_h = llm_output.get("recommended_height") or 0
# 1. Widget images (YAML frontmatter)
gallery = extract_gallery_images(
readme_content, repo,
default_width=rec_w, default_height=rec_h,
)
if gallery:
# 2. Sample Gallery table images (markdown body), deduplicated
existing_urls = {img["url"] for img in gallery if img.get("url")}
table_images = extract_gallery_table_images(
readme_content, repo,
existing_urls=existing_urls,
default_width=rec_w, default_height=rec_h,
)
all_images = gallery + table_images
if all_images:
gallery_images = all_images
current_civitai = metadata.get("civitai") or {}
gallery_civitai = dict(current_civitai)
if "civitai" in updates and isinstance(updates["civitai"], dict):
gallery_civitai.update(updates["civitai"])
gallery_civitai["images"] = gallery
gallery_civitai["images"] = all_images
updates["civitai"] = gallery_civitai
# tags
@@ -159,6 +176,11 @@ class PostProcessor:
updates["llm_enriched_at"] = datetime.now(timezone.utc).isoformat()
preview_remote_url = (llm_output.get("preview_url") or "").strip()
# Fallback: if the LLM couldn't find a preview image in the cleaned
# README, use the first extracted gallery image (YAML widget images come
# first, followed by Sample Gallery table images).
if not preview_remote_url and gallery_images:
preview_remote_url = gallery_images[0].get("url", "")
current_preview = metadata.get("preview_url") or ""
if preview_remote_url and not (current_preview and os.path.exists(current_preview)):
local_path = await download_preview(model_path, preview_remote_url)
@@ -166,6 +188,22 @@ class PostProcessor:
preview_downloaded = True
updates["preview_url"] = local_path
# notes — plain-text summary of usage info from the LLM
new_notes = (llm_output.get("notes") or "").strip()
if new_notes:
updates["notes"] = new_notes
# usage_tips — JSON string (e.g. {"strength_min":0.85,"strength_max":1.4})
raw_tips = (llm_output.get("usage_tips") or "").strip()
if raw_tips and raw_tips != "{}":
try:
json.loads(raw_tips)
updates["usage_tips"] = raw_tips
except (json.JSONDecodeError, TypeError):
logger.warning(
"LLM returned invalid usage_tips JSON: %s", raw_tips[:200]
)
if updates:
updated_fields = await apply_metadata_updates(model_path, updates)

View File

@@ -84,6 +84,25 @@ The recommended image generation resolution for this model, in pixels. Look for
### preview_url
The URL of the most suitable preview image from the README. Look for image tags (e.g. `![alt](url)`) and the YAML frontmatter `widget:` section (which often has `output.url` fields). Choose the first image that appears to be a generation example (not a logo or diagram). Construct the absolute URL as `https://huggingface.co/{{repo}}/resolve/main/{filename}`. If no suitable image is found, return an empty string.
### notes
A plain-text summary of the model card's key practical usage information. Combine trigger words, style modifiers, recommended parameters (steps, CFG, resolution, sampler), and any setup tips into a readable paragraph. Return empty string if the README has no useful usage info.
### usage_tips
A JSON string with structured usage recommendations. Extract from the README any explicit ranges or recommended values (e.g. "Set LoRA strength: **0.85 - 1.4**", "CLIP strength: 0.5"). Possible fields (include only those you can determine):
```json
{
"strength_min": 0.85,
"strength_max": 1.4,
"strength_range": "0.85-1.4",
"strength": 0.6,
"clip_strength": 0.5,
"clip_skip": 2
}
```
Return the JSON string (e.g. `'{"strength_min":0.85,"strength_max":1.4}'`). Return `"{}"` if nothing useful is found.
### confidence
Your confidence level in the extracted data:
- "high" — most fields were explicitly stated in the README
@@ -104,6 +123,8 @@ Return ONLY a JSON object with exactly these fields (no markdown fences, no extr
"recommended_width": 768,
"recommended_height": 1024,
"preview_url": "<image URL or empty string>",
"notes": "<plain-text usage summary or empty string>",
"usage_tips": "<JSON string like '{\"strength_min\":0.85,\"strength_max\":1.4}' or '{}'>",
"confidence": "<high|medium|low>"
}
```

View File

@@ -1,8 +1,13 @@
"""Inline markdown-to-HTML converter for HF README content.
"""Inline markdown-to-HTML converter and LLM-prompt cleaner for HF README content.
No external dependencies. Strips YAML frontmatter, ``<Gallery />`` sections,
badge images, and HTML comments before rendering. Only used by the
``enrich_hf_metadata`` skill.
Also provides :func:`clean_readme_for_llm` which pre-processes the raw README
before it is injected into the LLM prompt, removing content that has little
value for metadata extraction (language-tagged code blocks, image URLs,
training tables, boilerplate sections, massive lists, etc.). Frontmatter
``widget:`` text and untagged code blocks are kept as useful signal.
"""
from __future__ import annotations
@@ -118,6 +123,88 @@ def extract_gallery_images(
return images
def _resolve_gallery_table_url(raw_target: str, base_url: str) -> str:
    """Turn the ``(...)`` target of a markdown image into an absolute URL.

    An optional markdown title (``path "Title"``) is dropped. Absolute
    ``http://`` / ``https://`` targets are returned unchanged; anything else
    is treated as a repo-relative path and joined onto *base_url*. Returns an
    empty string when no usable path remains.
    """
    parts = raw_target.strip().split(None, 1)
    if not parts:
        return ""
    target = parts[0]
    if target.startswith(("http://", "https://")):
        return target
    # Remove leading "./" and "/" *prefixes* only.  ``str.lstrip("./")``
    # strips a character set and would corrupt dot-directories such as
    # ``.cache/img.png``.
    while True:
        if target.startswith("./"):
            target = target[2:]
        elif target.startswith("/"):
            target = target[1:]
        else:
            break
    return f"{base_url}/{target}" if target else ""


def extract_gallery_table_images(
    markdown_text: str,
    repo: str,
    existing_urls: set | None = None,
    default_width: int = 512,
    default_height: int = 512,
) -> list[dict]:
    """Extract images from ``| Preview | Prompt |`` markdown gallery tables.

    A table is recognised when a line containing ``|`` is followed by a
    separator row (``|---|...``) and the first header cell contains one of
    ``preview``, ``sample``, ``gallery``, ``image`` or ``thumbnail``
    (case-insensitive). For every body row with at least two cells, the first
    ``![alt](target)`` in the first cell supplies the image and the second
    cell supplies the prompt.

    Args:
        markdown_text: Raw README markdown.
        repo: HuggingFace repo id (``owner/name``) used to build absolute
            URLs for relative image paths.
        existing_urls: URLs that were already collected; matching images are
            skipped. The set passed in is not mutated.
        default_width: Width recorded for every extracted image.
        default_height: Height recorded for every extracted image.

    Returns:
        A list of image dicts (``url``, ``type``, ``nsfwLevel``, ``width``,
        ``height``, ``meta``, ``hasMeta``, ``hasPositivePrompt``) in table
        order, with duplicate URLs removed. Empty when *markdown_text* or
        *repo* is empty or no gallery table is found.
    """
    if not markdown_text or not repo:
        return []

    base_url = f"https://huggingface.co/{repo}/resolve/main"
    gallery_keywords = ("preview", "sample", "gallery", "image", "thumbnail")
    images: list[dict] = []
    # Copy so the caller's set is never modified.
    seen_urls: set = set(existing_urls) if existing_urls else set()

    lines = markdown_text.split("\n")
    n = len(lines)
    i = 0
    while i < n:
        line = lines[i]
        if "|" not in line or i + 1 >= n:
            i += 1
            continue

        # A header row must be followed by a table separator row.
        if not re.match(r"^\|[\s:-]+\|", lines[i + 1]):
            i += 1
            continue

        first_cell = line.strip().lower().strip("|").split("|")[0].strip()
        if not any(kw in first_cell for kw in gallery_keywords):
            i += 1
            continue

        # Skip header + separator, then consume body rows.
        i += 2
        while i < n and "|" in lines[i]:
            row = lines[i]
            i += 1

            cells = [c.strip() for c in row.strip().strip("|").split("|")]
            if len(cells) < 2:
                continue
            url_match = re.search(r"!\[([^\]]*)\]\(([^)]+)\)", cells[0])
            if not url_match:
                continue

            url = _resolve_gallery_table_url(url_match.group(2), base_url)
            if not url or url in seen_urls:
                continue
            seen_urls.add(url)

            prompt = cells[1]
            images.append({
                "url": url,
                "type": "image",
                "nsfwLevel": 0,
                "width": default_width,
                "height": default_height,
                "meta": {"prompt": prompt, "negativePrompt": ""},
                "hasMeta": bool(prompt),
                "hasPositivePrompt": bool(prompt),
            })

    return images
def _extract_frontmatter(text: str) -> str:
"""Return the YAML frontmatter content (without the ``---`` delimiters).
@@ -145,7 +232,260 @@ def convert_readme_to_html(markdown_text: str | None) -> str:
# ---------------------------------------------------------------------------
# Pre-processing: strip unwanted sections
# README cleaning for LLM prompt injection
# ---------------------------------------------------------------------------
#: Section headers that signal boilerplate content with zero metadata value.
#: Compared against the lower-cased heading title in
#: ``_strip_boilerplate_sections``: a heading matches when its title equals
#: an entry or starts with the entry followed by a space or a colon.
_BOILERPLATE_HEADERS: tuple[str, ...] = (
    "download model",
    "license",
    "citation",
    "links",
    "disclaimer",
    "architecture notes",
    "training details",
    "dataset",
    "provenance",
)
#: Keywords that identify training-parameter tables. ``_strip_training_tables``
#: looks for them as case-insensitive substrings of a table's header row and
#: its first data row.
_TRAINING_PARAM_KEYWORDS: tuple[str, ...] = (
    "lr scheduler",
    "optimizer",
    "network dim",
    "network alpha",
    "noise offset",
    "multires noise",
    "repeat",
    "epoch",
    "batch size",
    "gradient accumulation",
    "learning rate",
    "rslora",
    "dtype",
)
#: Minimum length (in characters) at which a line counts as list-like when it
#: opens a candidate run in ``_strip_massive_lists``. Follow-up lines in the
#: run are not tested against this length.
_MASSIVE_LIST_LINE_MIN_LEN = 150
#: Minimum number of list-like lines in a run before ``_strip_massive_lists``
#: drops the whole run.
_MASSIVE_LIST_THRESHOLD = 8
def clean_readme_for_llm(markdown_text: str | None, max_length: int = 6000) -> str:
    """Clean a HF README for injection into an LLM metadata-extraction prompt.

    Runs the following passes, in this order:

    * ``_strip_gallery`` — gallery markup (helper defined outside this
      view; presumably ``<Gallery />`` tags and wrappers)
    * ``_strip_fenced_code_blocks`` — fenced code blocks that carry an
      explicit language tag; untagged blocks are kept
    * ``_strip_standalone_images`` — own-line ``![alt](url)`` images are
      reduced to their alt text, own-line ``<img>`` tags are removed
    * ``_strip_training_tables`` — training-parameter tables
    * ``_strip_boilerplate_sections`` — Download / License / Citation / …
    * ``_strip_massive_lists`` — long enumeration runs (e.g. name lists)
    * ``_strip_badge_images`` and ``_strip_html_comments`` — helpers defined
      outside this view
    * ``_compress_blank_lines`` — squeeze leftover runs of blank lines

    None of the passes visible here removes the YAML frontmatter; the
    module's tests expect ``widget:`` example prompts and other frontmatter
    keys to survive cleaning.

    The post-processor still receives the **full** raw README via
    ``readme_content_full``, so nothing is lost for HTML conversion or
    gallery-image extraction.

    Args:
        markdown_text: Raw README.md content from HuggingFace.
        max_length: Hard ceiling on output length (default 6 000 chars).

    Returns:
        Cleaned markdown, cut to *max_length* characters and then stripped
        of surrounding whitespace (so it may be slightly shorter than
        *max_length*). Empty string when *markdown_text* is ``None`` or
        empty.
    """
    if not markdown_text:
        return ""

    text = markdown_text

    # Order matters — broader strips first, then finer ones.
    text = _strip_gallery(text)
    text = _strip_fenced_code_blocks(text)
    text = _strip_standalone_images(text)
    text = _strip_training_tables(text)
    text = _strip_boilerplate_sections(text)
    text = _strip_massive_lists(text)
    text = _strip_badge_images(text)
    text = _strip_html_comments(text)
    text = _compress_blank_lines(text)

    # Truncate first, then strip, so a cut that lands in whitespace does not
    # leave a dangling blank tail.
    if len(text) > max_length:
        text = text[:max_length]

    return text.strip()
def _strip_fenced_code_blocks(text: str) -> str:
"""Strip fenced code blocks that have an explicit programming-language tag.
Blocks without a language tag (just `` ``` ``) are preserved — they
often contain trigger words, example prompts, or config snippets
rather than actual runnable code.
"""
# Match opening ``` immediately followed by a word character (the language
# tag), then any content, then closing ```. Plain ``` at the start of a
# line is left intact. A leading \n is optional (handles blocks at the
# start of the text).
return re.sub(
r"(?:\n|^)```[a-zA-Z_][a-zA-Z0-9_]*\s*\n.*?\n```",
"",
text,
flags=re.DOTALL,
)
def _strip_standalone_images(text: str) -> str:
"""Strip image embeds that occupy their own line.
Preserves the alt text from markdown images (``![alt](url)`` → ``alt``)
since it often describes what the model generates, which is useful signal
for tag/description extraction.
"""
# Markdown: ``![alt](url)`` on its own line → keep alt text
text = re.sub(
r"^\s*!\[([^\]]*)\]\([^)]+\)\s*$",
r"\1",
text,
flags=re.MULTILINE,
)
# HTML: ``<img src="..." ...>`` on its own line → remove entirely
text = re.sub(
r'^\s*<img\s[^>]+/?>(?:</img>)?\s*$',
"",
text,
flags=re.MULTILINE | re.IGNORECASE,
)
return text
def _strip_training_tables(text: str) -> str:
    """Drop markdown tables that look like training-parameter listings.

    A table starts at a line containing ``|`` whose next line is a separator
    row, and extends over every following line that contains ``|``. The
    header row and the first data row are lower-cased and searched for any
    entry of ``_TRAINING_PARAM_KEYWORDS``; on a hit the whole table is
    omitted. Other tables (e.g. "Best Dimensions") pass through unchanged.
    """
    rows = text.split("\n")
    total = len(rows)
    kept: list[str] = []
    pos = 0
    while pos < total:
        header = rows[pos]
        starts_table = (
            "|" in header
            and pos + 1 < total
            and re.match(r"^\|[\s:-]+\|", rows[pos + 1]) is not None
        )
        if not starts_table:
            kept.append(header)
            pos += 1
            continue

        # Find the end of the table: first following line without a pipe.
        end = pos + 1
        while end < total and "|" in rows[end]:
            end += 1
        table = rows[pos:end]
        pos = end

        # table[0] is the header, table[1] the separator, table[2] the
        # first data row (when present).
        first_data = table[2] if len(table) > 2 else ""
        probe = (header + "\n" + first_data).lower()
        if not any(kw in probe for kw in _TRAINING_PARAM_KEYWORDS):
            kept.extend(table)
    return "\n".join(kept)
def _strip_boilerplate_sections(text: str) -> str:
    """Strip sections whose headings match known boilerplate patterns.

    When a heading of level 1–4 (``## Download model``, ``## License``,
    etc.) matches an entry of ``_BOILERPLATE_HEADERS``, the heading and all
    content until the next heading of equal-or-higher level is removed. A
    title matches when, lower-cased, it equals an entry or starts with the
    entry followed by a space or a colon.

    Sub-headings nested inside a section that is already being skipped are
    skipped with it and never change the skip level. Otherwise a
    boilerplate sub-heading (e.g. ``### Citation`` under ``## License``)
    would narrow the skip to its own level, and the next sibling
    sub-heading would leak the rest of the boilerplate section.

    Args:
        text: Markdown text.

    Returns:
        *text* without the boilerplate sections.
    """
    out: list[str] = []
    skip_until_level: int | None = None

    for line in text.split("\n"):
        h_match = re.match(r"^(#{1,4})\s+(.+?)\s*#*$", line)
        if h_match:
            level = len(h_match.group(1))
            if skip_until_level is not None and level > skip_until_level:
                # Deeper heading inside the skipped section: still skipped.
                continue

            title = h_match.group(2).strip().lower()
            is_boilerplate = any(
                title == kw or title.startswith(kw + " ") or title.startswith(kw + ":")
                for kw in _BOILERPLATE_HEADERS
            )
            if is_boilerplate:
                skip_until_level = level
                continue
            # Non-boilerplate heading at the same or a higher level ends
            # any active skip.
            skip_until_level = None

        if skip_until_level is None:
            out.append(line)

    return "\n".join(out)
def _strip_massive_lists(text: str) -> str:
    """Remove long runs of enumeration-style lines.

    A run opens on a non-blank line that ends with a comma, is at least
    ``_MASSIVE_LIST_LINE_MIN_LEN`` characters long, or is a bullet
    containing a comma. It continues over following lines that end with a
    comma or are bullets containing a comma; blank lines inside the run are
    stepped over without being counted. When the run holds at least
    ``_MASSIVE_LIST_THRESHOLD`` counted lines, everything up to the first
    non-matching line is dropped. Shorter runs are left as they are.
    """

    def _enumerates(candidate: str) -> bool:
        # Trailing comma, or a ``-``/``*``/``+`` bullet that contains a comma.
        if candidate.endswith(","):
            return True
        return re.match(r"^[-*+]\s", candidate) is not None and "," in candidate

    rows = text.split("\n")
    total = len(rows)
    kept: list[str] = []
    pos = 0
    while pos < total:
        head = rows[pos].strip()
        opens_run = bool(head) and (
            len(head) >= _MASSIVE_LIST_LINE_MIN_LEN or _enumerates(head)
        )
        if opens_run:
            run_length = 1
            cursor = pos + 1
            while cursor < total:
                follower = rows[cursor].strip()
                if follower and not _enumerates(follower):
                    break
                if follower:
                    run_length += 1
                cursor += 1
            if run_length >= _MASSIVE_LIST_THRESHOLD:
                # Skip the whole run, including blank lines stepped over.
                pos = cursor
                continue
        kept.append(rows[pos])
        pos += 1
    return "\n".join(kept)
def _compress_blank_lines(text: str) -> str:
"""Collapse runs of 3+ blank lines down to 2."""
return re.sub(r"\n{3,}", "\n\n", text)
# ---------------------------------------------------------------------------
# Pre-processing: strip unwanted sections (HTML conversion helpers)
# ---------------------------------------------------------------------------

View File

@@ -333,18 +333,53 @@ class LLMService:
cfg = self._ensure_configured()
api_base = self._resolve_api_base(cfg["provider"], cfg["api_base"])
url = f"{api_base}/chat/completions"
model_name = model or cfg["model"]
payload: Dict[str, Any] = {
"model": model_name,
"messages": messages,
"temperature": temperature,
}
if response_format is not None:
payload["response_format"] = response_format
if max_tokens is not None:
payload["max_tokens"] = max_tokens
is_ollama = cfg["provider"] == "ollama"
if is_ollama:
# Use Ollama's native /api/chat endpoint which does NOT expose
# a separate reasoning/thinking field (the model's full output
# lands directly in message.content). The OpenAI-compatible
# endpoint splits thinking into the "reasoning" field, making
# content empty when thinking consumes all available tokens.
base = api_base.rstrip("/")
if base.endswith("/v1"):
base = base[:-3]
url = f"{base}/api/chat"
else:
url = f"{api_base}/chat/completions"
payload: Dict[str, Any]
if is_ollama:
payload = {
"model": model_name,
"messages": messages,
"stream": False,
# Disable the separate thinking trace so the model's answer is
# written directly to message.content. Without this, a
# thinking-capable model can exhaust num_predict on thinking
# alone and leave content empty.
"think": False,
"options": {
"temperature": temperature,
},
}
if response_format is not None:
payload["format"] = "json"
if max_tokens is not None:
payload["options"]["num_predict"] = max_tokens
else:
payload = {
"model": model_name,
"messages": messages,
"temperature": temperature,
}
if response_format is not None:
payload["response_format"] = response_format
if max_tokens is not None:
payload["max_tokens"] = max_tokens
headers = self._build_headers(cfg["api_key"])
@@ -387,8 +422,25 @@ class LLMService:
# Parse response
try:
content = data["choices"][0]["message"]["content"]
usage = data.get("usage", {})
if is_ollama:
content = (data.get("message") or {}).get("content") or ""
usage = {"completion_tokens": data.get("eval_count", 0)}
finish_reason = data.get("done_reason", "")
if not content:
logger.warning(
"LLM returned empty content. Provider=ollama, "
"done_reason=%s, eval_count=%s",
finish_reason,
data.get("eval_count", 0),
)
else:
content = data["choices"][0]["message"].get("content") or ""
usage = data.get("usage", {})
if not content:
logger.warning(
"LLM returned empty content. Full response truncated: %s",
json.dumps(data, ensure_ascii=False)[:1000],
)
return {
"content": content,
"usage": usage,
@@ -442,13 +494,16 @@ class LLMService:
{"role": "user", "content": user_prompt},
]
# First attempt with JSON mode
# First attempt with JSON mode.
# Use a generous max_tokens so thinking-enabled models (e.g.
# gemma4 via Ollama) have room to reason AND still emit content.
effective_max = max_tokens or 131072
result = await self.chat_completion(
messages=messages,
model=model,
temperature=temperature,
response_format={"type": "json_object"},
max_tokens=max_tokens,
max_tokens=effective_max,
)
try:
@@ -458,11 +513,15 @@ class LLMService:
"LLM JSON parse failed on first attempt: %s. Retrying.", exc
)
# Retry with explicit instruction to return valid JSON
# Retry WITHOUT response_format — some providers (Ollama with
# thinking-enabled models like gemma4) may return empty content
# when json_object mode is active. Fall back to a textual
# instruction instead.
previous_content = result.get("content", "") or ""
retry_messages = messages + [
{
"role": "assistant",
"content": result["content"],
"content": previous_content or "(empty response)",
},
{
"role": "user",
@@ -478,14 +537,21 @@ class LLMService:
messages=retry_messages,
model=model,
temperature=0.0, # More deterministic for retry
response_format={"type": "json_object"},
max_tokens=max_tokens,
max_tokens=effective_max,
)
try:
return json.loads(result["content"])
except (json.JSONDecodeError, TypeError) as exc:
content = result.get("content", "") or ""
if not content:
raise LLMResponseError(
f"LLM response could not be parsed as JSON after retry: {exc}\n"
f"Raw content: {result['content'][:500]}"
) from exc
"LLM response could not be parsed as JSON after retry: "
f"Expecting value: line 1 column 1 (char 0)\n"
f"Raw content: {content[:500]}"
)
try:
return json.loads(content)
except (json.JSONDecodeError, TypeError) as parse_err:
raise LLMResponseError(
f"LLM response could not be parsed as JSON after retry: {parse_err}\n"
f"Raw content: {content[:500]}"
) from parse_err

View File

@@ -583,3 +583,443 @@ widget:
assert len(images) == 1
assert "two samurais doing a muay thai fight" in images[0]["meta"]["prompt"]
assert "Textured abstract style" in images[0]["meta"]["prompt"]
# ======================================================================
# extract_gallery_table_images — Sample Gallery markdown tables
# ======================================================================
class TestExtractGalleryTableImages:
    """Tests for ``extract_gallery_table_images`` on markdown gallery tables."""

    # Repo id used as the default when building absolute image URLs.
    _REPO = "Limbicnation/pixel-art-lora"
    # Minimal README body: a two-row Preview/Prompt table with relative paths.
    _README = """## Sample Gallery
| Preview | Prompt |
|---------|--------|
| ![Knight](./samples/knight.png) | pixel art sprite, a brave knight |
| ![Dragon](./samples/dragon.png) | pixel art sprite, a fire dragon |
"""

    @staticmethod
    def _extract(md: str, repo: str = _REPO, existing: set | None = None):
        """Call ``extract_gallery_table_images`` with *existing* as ``existing_urls``."""
        # Imported inside the helper rather than at module level.
        from py.services.agent.skills.enrich_hf_metadata.md_to_html import \
            extract_gallery_table_images
        return extract_gallery_table_images(md, repo, existing_urls=existing)

    def test_extracts_table_images(self):
        """Both table rows yield an image, in order, with the prompt cell as meta."""
        images = self._extract(self._README)
        assert len(images) == 2
        assert "knight.png" in images[0]["url"]
        assert images[0]["meta"]["prompt"] == "pixel art sprite, a brave knight"
        assert "dragon.png" in images[1]["url"]

    def test_skips_existing_urls(self):
        """A URL already present in ``existing_urls`` is not returned again."""
        existing = {"https://huggingface.co/Limbicnation/pixel-art-lora/resolve/main/samples/knight.png"}
        images = self._extract(self._README, existing=existing)
        assert len(images) == 1
        assert "knight.png" not in images[0]["url"]

    def test_empty_readme_returns_empty(self):
        """Empty markdown produces an empty list."""
        assert self._extract("") == []

    def test_no_gallery_table_returns_empty(self):
        """Markdown without any table produces an empty list."""
        md = "## Description\nSome text."
        assert self._extract(md) == []

    def test_non_gallery_table_skipped(self):
        """A table whose first header cell is not a gallery keyword is ignored."""
        md = "| Param | Value |\n|---|---|\n| Steps | 4 |"
        assert self._extract(md) == []

    def test_absolute_url_preserved(self):
        """Absolute image URLs are returned unchanged, not rebased onto the repo."""
        md = "| Preview | Prompt |\n|---|---|\n| ![img](https://cdn.example.com/img.png) | text |"
        images = self._extract(md, repo="user/repo")
        assert len(images) == 1
        assert images[0]["url"] == "https://cdn.example.com/img.png"
# ======================================================================
# clean_readme_for_llm — pre-process README before LLM injection
# ======================================================================
class TestCleanReadmeForLlm:
@staticmethod
def _clean(md: str, max_length: int = 6000) -> str:
from py.services.agent.skills.enrich_hf_metadata.md_to_html import \
clean_readme_for_llm
return clean_readme_for_llm(md, max_length=max_length)
# -- basic guards --------------------------------------------------------
def test_none_returns_empty(self):
assert self._clean(None) == "" # type: ignore[arg-type]
def test_empty_returns_empty(self):
assert self._clean("") == ""
def test_plain_text_passes_through(self):
result = self._clean("Just some description text.")
assert "Just some description text." in result
# -- widget section stripping -------------------------------------------
def test_widget_text_preserved_in_cleaned_output(self):
"""Widget section text is preserved — it provides useful signal
for tag and description extraction (example prompts describe what
the model generates)."""
md = """---
tags:
- lora
- anime
widget:
- text: "a test prompt"
output:
url: images/test.png
- text: >-
another long
prompt here
output:
url: images/test2.png
base_model: black-forest-labs/FLUX.1-dev
instance_prompt: trigger word
---
# Model Description
This is the actual content.
"""
result = self._clean(md)
# Widget text content preserved (valuable signal for tags)
# YAML folded scalars (``>-``) may split text across lines
assert "a test prompt" in result
assert "another long" in result
assert "prompt here" in result
# Non-widget frontmatter preserved
assert "base_model: black-forest-labs/FLUX.1-dev" in result
assert "instance_prompt: trigger word" in result
assert "tags:" in result
assert "- lora" in result
assert "- anime" in result
assert "Model Description" in result
def test_widget_last_key_in_frontmatter(self):
"""Widget text at end of frontmatter is preserved."""
md = """---
tags:
- lora
widget:
- output:
url: img.png
text: prompt
---
# Content
"""
result = self._clean(md)
assert "prompt" in result
assert "tags:" in result
def test_no_widget_untouched(self):
md = """---
tags:
- lora
base_model: flux
---
# Content
"""
result = self._clean(md)
assert "tags:" in result
assert "base_model: flux" in result
# -- gallery stripping ---------------------------------------------------
def test_gallery_tag_stripped(self):
md = "Some text\n<Gallery />\nmore text"
result = self._clean(md)
assert "<Gallery" not in result
# -- code block stripping ------------------------------------------------
def test_fenced_code_block_stripped(self):
md = """## Usage
```python
import torch
pipe = DiffusionPipeline.from_pretrained('base')
```
## Description
Some text.
"""
result = self._clean(md)
assert "import torch" not in result
assert "DiffusionPipeline" not in result
assert "## Usage" in result
assert "## Description" in result
def test_bash_code_block_stripped(self):
md = """## Setup
```bash
pip install diffusers
huggingface-cli download repo
```
"""
result = self._clean(md)
assert "pip install" not in result
assert "## Setup" in result
def test_code_block_sections_remain_separated(self):
md = "## Install\n```bash\npip install x\n```\n\n## Usage\nSome text."
result = self._clean(md)
assert "pip install" not in result
assert "## Install" in result
assert "## Usage" in result
assert "Some text." in result
def test_unmarked_code_block_preserved(self):
"""Unmarked fenced code blocks (just ```) are kept since they
often contain trigger words rather than code."""
md = """### Trigger Words
Always include:
```
pixel art sprite, game asset, transparent background
```
"""
result = self._clean(md)
assert "pixel art sprite" in result
assert "game asset" in result
assert "transparent background" in result
def test_unmarked_code_block_with_python_preserved(self):
"""Even unmarked blocks with Python code are kept (false positive
accepted because trigger-word blocks are unmarked)."""
md = "## Setup\n```\nimport torch\nprint('hello')\n```\n## Desc\nText."
result = self._clean(md)
assert "import torch" in result
# -- standalone image stripping ------------------------------------------
def test_standalone_image_stripped(self):
md = "## Gallery\n![sample](https://cdn.hf.co/img.png)\n![another](https://cdn.hf.co/img2.png)\n\nSome text."
result = self._clean(md)
assert "cdn.hf.co" not in result
assert "sample" in result # alt text preserved
assert "another" in result # alt text preserved
assert "## Gallery" in result
assert "Some text." in result
def test_html_img_tag_stripped(self):
md = '## Preview\n<img src="https://cdn.hf.co/img.webp"></img>\n\nDescription.'
result = self._clean(md)
assert "cdn.hf.co" not in result
assert "Description." in result
def test_inline_image_within_paragraph_preserved(self):
"""Inline images inside paragraphs are rare but shouldn't be stripped."""
md = "Click here ![icon](https://example.com/icon.png) for more info."
result = self._clean(md)
assert "Click here" in result
assert "for more info" in result
# -- training table stripping --------------------------------------------
def test_training_table_stripped(self):
md = """## Training
| Parameter | Value |
|---------------|----------|
| LR Scheduler | constant |
| Optimizer | AdamW |
| Network Dim | 64 |
## Best Dimensions
| Resolution | Status |
|-----------|---------|
| 768x1024 | Best |
"""
result = self._clean(md)
assert "LR Scheduler" not in result
assert "Optimizer" not in result
assert "Network Dim" not in result
# Normal table preserved
assert "Best Dimensions" in result
assert "768x1024" in result
def test_normal_table_preserved(self):
md = """## Recommended
| Resolution | Status |
|-----------|---------|
| 1024x1024 | Default |
"""
result = self._clean(md)
assert "1024x1024" in result
# -- boilerplate section stripping ---------------------------------------
def test_boilerplate_license_stripped(self):
md = """## Description
Some text.
## License
apache-2.0
Some license details here.
## More Content
After license.
"""
result = self._clean(md)
assert "apache-2.0" not in result
assert "## License" not in result
assert "## Description" in result
assert "## More Content" in result
assert "After license." in result
def test_boilerplate_disclaimer_stripped(self):
md = """## Description
Some text.
## DISCLAIMER
Legal text here.
## Citation
Bibtex here.
"""
result = self._clean(md)
assert "Legal text" not in result
assert "Bibtex" not in result
assert "Some text." in result
def test_boilerplate_subsection_not_stripped(self):
"""Only top-level (##) boilerplate is stripped; ### subsections inside
non-boilerplate headings are left alone."""
md = """## Usage
Some text.
### Important Note
This is a note within the usage section.
"""
result = self._clean(md)
assert "Important Note" in result
# -- massive list stripping ----------------------------------------------
def test_massive_name_list_stripped(self):
lines = ["## 2026 Updates:"]
for i in range(12):
lines.append(f"Name{i}A, Name{i}B, Name{i}C, Name{i}D, Name{i}E,")
lines.append("## License")
lines.append("apache")
md = "\n".join(lines)
result = self._clean(md)
assert "Name0A" not in result
assert "Name11E" not in result
assert "## 2026 Updates:" in result
# License stripped by boilerplate
assert "apache" not in result
def test_short_list_preserved(self):
"""Short lists (< 8 consecutive lines) should not be stripped."""
lines = ["## Tags:"]
for i in range(4):
lines.append(f"tag{i}A, tag{i}B,")
lines.append("## Description")
lines.append("Some text.")
md = "\n".join(lines)
result = self._clean(md)
assert "tag0A" in result
assert "tag3B" in result
# -- max_length truncation -----------------------------------------------
def test_truncation(self):
md = "A" * 100 + "\n" + "B" * 100
result = self._clean(md, max_length=150)
assert len(result) <= 150
assert result.startswith("A" * 100)
# -- integration: end-to-end realistic README ----------------------------
def test_realistic_flux_lora_readme(self):
md = """---
tags:
- text-to-image
- lora
- diffusers
- 3D
- Toon
widget:
- text: >-
Long toons, a close-up of a cartoon character face...
output:
url: images/LT4.png
- text: >-
Long toons, Super Detail, a close-up shot...
output:
url: images/LT5.png
base_model: black-forest-labs/FLUX.1-dev
instance_prompt: Long toons
license: creativeml-openrail-m
---
# Flux-Long-Toon-LoRA
<Gallery />
**The model is still in the training phase.**
## Model description
**prithivMLmods/Flux-Long-Toon-LoRA**
Image Processing Parameters
| Parameter | Value | Parameter | Value |
|---------------------------|--------|---------------------------|--------|
| LR Scheduler | constant | Noise Offset | 0.03 |
| Optimizer | AdamW | Multires Noise Discount | 0.1 |
| Network Dim | 64 | Multires Noise Iterations | 10 |
| Network Alpha | 32 | Repeat & Steps | 25 & 3270 |
| Epoch | 18 | Save Every N Epochs | 1 |
## Best Dimensions
- 768 x 1024 (Best)
- 1024 x 1024 (Default)
## Setting Up
```python
import torch
from pipelines import DiffusionPipeline
base_model = "black-forest-labs/FLUX.1-dev"
pipe = DiffusionPipeline.from_pretrained(base_model, torch_dtype=torch.bfloat16)
lora_repo = "prithivMLmods/Flux-Long-Toon-LoRA"
trigger_word = "Long toons"
pipe.load_lora_weights(lora_repo)
```
## Trigger words
You should use `Long toons` to trigger the image generation.
## Download model
Weights for this model are available in Safetensors format.
"""
original_len = len(md)
result = self._clean(md)
# Still significantly smaller (widget text is kept but training
# tables, code blocks, boilerplate are stripped)
assert len(result) < original_len * 0.7, (
f"Expected <70% of original, got {len(result)}/{original_len}"
)
# Signal preserved
assert "Long toons" in result
assert "black-forest-labs/FLUX.1-dev" in result
assert "3D" in result
assert "Toon" in result
# Widget content preserved (text is valuable signal for tags/desc)
assert "close-up of a cartoon character face" in result
assert "Super Detail" in result
# Noise stripped
assert "import torch" not in result
assert "DiffusionPipeline" not in result
assert "LR Scheduler" not in result
assert "<Gallery" not in result
assert "Download model" not in result