mirror of
https://github.com/willmiao/ComfyUI-Lora-Manager.git
synced 2026-07-05 17:01:16 -03:00
feat(agent): optimize enrich_hf_metadata with README cleaning, Ollama native API, and expanded fields
- Add clean_readme_for_llm() to strip noise from README before LLM injection - Keep widget section text (valuable tag signal) and unmarked code blocks (trigger words) - Preserve standalone image alt text instead of removing entirely - Switch Ollama to native /api/chat with think:false to fix empty content on thinking models - Extract Sample Gallery table images and deduplicate with widget images - Only strip code blocks with explicit language tags (bash) - Add notes and usage_tips fields to SKILL.md output format and post-processor - Clean up dead code, fix regex edge cases, remove double type annotation
This commit is contained in:
@@ -28,6 +28,7 @@ from ..llm_service import LLMService
|
|||||||
from ..websocket_manager import ws_manager
|
from ..websocket_manager import ws_manager
|
||||||
from .post_processor import PostProcessor
|
from .post_processor import PostProcessor
|
||||||
from .skill_registry import SkillRegistry
|
from .skill_registry import SkillRegistry
|
||||||
|
from .skills.enrich_hf_metadata.md_to_html import clean_readme_for_llm
|
||||||
|
|
||||||
logger = logging.getLogger(__name__)
|
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
@@ -368,7 +369,8 @@ class AgentService:
|
|||||||
context["repo"] = repo or ""
|
context["repo"] = repo or ""
|
||||||
if repo:
|
if repo:
|
||||||
readme = await self._fetch_readme(repo)
|
readme = await self._fetch_readme(repo)
|
||||||
context["readme_content"] = readme[:8000] if readme else "(README not available)"
|
cleaned = clean_readme_for_llm(readme) if readme else ""
|
||||||
|
context["readme_content"] = cleaned if cleaned else "(README not available)"
|
||||||
context["readme_content_full"] = readme or ""
|
context["readme_content_full"] = readme or ""
|
||||||
|
|
||||||
try:
|
try:
|
||||||
|
|||||||
@@ -10,6 +10,7 @@ refresh cache). All actual I/O is delegated to :mod:`~py.agent_cli`.
|
|||||||
|
|
||||||
from __future__ import annotations
|
from __future__ import annotations
|
||||||
|
|
||||||
|
import json
|
||||||
import logging
|
import logging
|
||||||
import os
|
import os
|
||||||
from datetime import datetime, timezone
|
from datetime import datetime, timezone
|
||||||
@@ -79,6 +80,7 @@ class PostProcessor:
|
|||||||
from .skills.enrich_hf_metadata.md_to_html import (
|
from .skills.enrich_hf_metadata.md_to_html import (
|
||||||
convert_readme_to_html,
|
convert_readme_to_html,
|
||||||
extract_gallery_images,
|
extract_gallery_images,
|
||||||
|
extract_gallery_table_images,
|
||||||
extract_repo_from_hf_url,
|
extract_repo_from_hf_url,
|
||||||
)
|
)
|
||||||
|
|
||||||
@@ -127,23 +129,38 @@ class PostProcessor:
|
|||||||
desc_civitai["description"] = short_desc
|
desc_civitai["description"] = short_desc
|
||||||
updates["civitai"] = desc_civitai
|
updates["civitai"] = desc_civitai
|
||||||
|
|
||||||
# gallery images → civitai.images (from YAML frontmatter widget entries)
|
# gallery images → civitai.images (from YAML frontmatter widget entries
|
||||||
|
# and Sample Gallery markdown tables in the README body)
|
||||||
|
gallery_images: List[Dict[str, Any]] = []
|
||||||
if readme_content and is_hf_model:
|
if readme_content and is_hf_model:
|
||||||
hf_url = metadata.get("hf_url", "") or ""
|
hf_url = metadata.get("hf_url", "") or ""
|
||||||
repo = extract_repo_from_hf_url(hf_url)
|
repo = extract_repo_from_hf_url(hf_url)
|
||||||
if repo:
|
if repo:
|
||||||
rec_w = llm_output.get("recommended_width") or 0
|
rec_w = llm_output.get("recommended_width") or 0
|
||||||
rec_h = llm_output.get("recommended_height") or 0
|
rec_h = llm_output.get("recommended_height") or 0
|
||||||
|
|
||||||
|
# 1. Widget images (YAML frontmatter)
|
||||||
gallery = extract_gallery_images(
|
gallery = extract_gallery_images(
|
||||||
readme_content, repo,
|
readme_content, repo,
|
||||||
default_width=rec_w, default_height=rec_h,
|
default_width=rec_w, default_height=rec_h,
|
||||||
)
|
)
|
||||||
if gallery:
|
|
||||||
|
# 2. Sample Gallery table images (markdown body), deduplicated
|
||||||
|
existing_urls = {img["url"] for img in gallery if img.get("url")}
|
||||||
|
table_images = extract_gallery_table_images(
|
||||||
|
readme_content, repo,
|
||||||
|
existing_urls=existing_urls,
|
||||||
|
default_width=rec_w, default_height=rec_h,
|
||||||
|
)
|
||||||
|
|
||||||
|
all_images = gallery + table_images
|
||||||
|
if all_images:
|
||||||
|
gallery_images = all_images
|
||||||
current_civitai = metadata.get("civitai") or {}
|
current_civitai = metadata.get("civitai") or {}
|
||||||
gallery_civitai = dict(current_civitai)
|
gallery_civitai = dict(current_civitai)
|
||||||
if "civitai" in updates and isinstance(updates["civitai"], dict):
|
if "civitai" in updates and isinstance(updates["civitai"], dict):
|
||||||
gallery_civitai.update(updates["civitai"])
|
gallery_civitai.update(updates["civitai"])
|
||||||
gallery_civitai["images"] = gallery
|
gallery_civitai["images"] = all_images
|
||||||
updates["civitai"] = gallery_civitai
|
updates["civitai"] = gallery_civitai
|
||||||
|
|
||||||
# tags
|
# tags
|
||||||
@@ -159,6 +176,11 @@ class PostProcessor:
|
|||||||
updates["llm_enriched_at"] = datetime.now(timezone.utc).isoformat()
|
updates["llm_enriched_at"] = datetime.now(timezone.utc).isoformat()
|
||||||
|
|
||||||
preview_remote_url = (llm_output.get("preview_url") or "").strip()
|
preview_remote_url = (llm_output.get("preview_url") or "").strip()
|
||||||
|
# Fallback: if the LLM couldn't find a preview image in the cleaned
|
||||||
|
# README, use the first gallery image extracted from the YAML widget
|
||||||
|
# section.
|
||||||
|
if not preview_remote_url and gallery_images:
|
||||||
|
preview_remote_url = gallery_images[0].get("url", "")
|
||||||
current_preview = metadata.get("preview_url") or ""
|
current_preview = metadata.get("preview_url") or ""
|
||||||
if preview_remote_url and not (current_preview and os.path.exists(current_preview)):
|
if preview_remote_url and not (current_preview and os.path.exists(current_preview)):
|
||||||
local_path = await download_preview(model_path, preview_remote_url)
|
local_path = await download_preview(model_path, preview_remote_url)
|
||||||
@@ -166,6 +188,22 @@ class PostProcessor:
|
|||||||
preview_downloaded = True
|
preview_downloaded = True
|
||||||
updates["preview_url"] = local_path
|
updates["preview_url"] = local_path
|
||||||
|
|
||||||
|
# notes — plain-text summary of usage info from the LLM
|
||||||
|
new_notes = (llm_output.get("notes") or "").strip()
|
||||||
|
if new_notes:
|
||||||
|
updates["notes"] = new_notes
|
||||||
|
|
||||||
|
# usage_tips — JSON string (e.g. {"strength_min":0.85,"strength_max":1.4})
|
||||||
|
raw_tips = (llm_output.get("usage_tips") or "").strip()
|
||||||
|
if raw_tips and raw_tips != "{}":
|
||||||
|
try:
|
||||||
|
json.loads(raw_tips)
|
||||||
|
updates["usage_tips"] = raw_tips
|
||||||
|
except (json.JSONDecodeError, TypeError):
|
||||||
|
logger.warning(
|
||||||
|
"LLM returned invalid usage_tips JSON: %s", raw_tips[:200]
|
||||||
|
)
|
||||||
|
|
||||||
if updates:
|
if updates:
|
||||||
updated_fields = await apply_metadata_updates(model_path, updates)
|
updated_fields = await apply_metadata_updates(model_path, updates)
|
||||||
|
|
||||||
|
|||||||
@@ -84,6 +84,25 @@ The recommended image generation resolution for this model, in pixels. Look for
|
|||||||
### preview_url
|
### preview_url
|
||||||
The URL of the most suitable preview image from the README. Look for image tags (e.g. `![alt text](path/to/image.png)`) and the YAML frontmatter `widget:` section (which often has `output.url` fields). Choose the first image that appears to be a generation example (not a logo or diagram). Construct the absolute URL as `https://huggingface.co/{{repo}}/resolve/main/{filename}`. If no suitable image is found, return an empty string.
|
The URL of the most suitable preview image from the README. Look for image tags (e.g. `![alt text](path/to/image.png)`) and the YAML frontmatter `widget:` section (which often has `output.url` fields). Choose the first image that appears to be a generation example (not a logo or diagram). Construct the absolute URL as `https://huggingface.co/{{repo}}/resolve/main/{filename}`. If no suitable image is found, return an empty string.
|
||||||
|
|
||||||
|
### notes
|
||||||
|
A plain-text summary of the model card's key practical usage information. Combine trigger words, style modifiers, recommended parameters (steps, CFG, resolution, sampler), and any setup tips into a readable paragraph. Return empty string if the README has no useful usage info.
|
||||||
|
|
||||||
|
### usage_tips
|
||||||
|
A JSON string with structured usage recommendations. Extract from the README any explicit ranges or recommended values (e.g. "Set LoRA strength: **0.85 - 1.4**", "CLIP strength: 0.5"). Possible fields (include only those you can determine):
|
||||||
|
|
||||||
|
```json
|
||||||
|
{
|
||||||
|
"strength_min": 0.85,
|
||||||
|
"strength_max": 1.4,
|
||||||
|
"strength_range": "0.85-1.4",
|
||||||
|
"strength": 0.6,
|
||||||
|
"clip_strength": 0.5,
|
||||||
|
"clip_skip": 2
|
||||||
|
}
|
||||||
|
```
|
||||||
|
|
||||||
|
Return the JSON string (e.g. `'{"strength_min":0.85,"strength_max":1.4}'`). Return `"{}"` if nothing useful is found.
|
||||||
|
|
||||||
### confidence
|
### confidence
|
||||||
Your confidence level in the extracted data:
|
Your confidence level in the extracted data:
|
||||||
- "high" — most fields were explicitly stated in the README
|
- "high" — most fields were explicitly stated in the README
|
||||||
@@ -104,6 +123,8 @@ Return ONLY a JSON object with exactly these fields (no markdown fences, no extr
|
|||||||
"recommended_width": 768,
|
"recommended_width": 768,
|
||||||
"recommended_height": 1024,
|
"recommended_height": 1024,
|
||||||
"preview_url": "<image URL or empty string>",
|
"preview_url": "<image URL or empty string>",
|
||||||
|
"notes": "<plain-text usage summary or empty string>",
|
||||||
|
"usage_tips": "<JSON string like '{\"strength_min\":0.85,\"strength_max\":1.4}' or '{}'>",
|
||||||
"confidence": "<high|medium|low>"
|
"confidence": "<high|medium|low>"
|
||||||
}
|
}
|
||||||
```
|
```
|
||||||
|
|||||||
@@ -1,8 +1,13 @@
|
|||||||
"""Inline markdown-to-HTML converter for HF README content.
|
"""Inline markdown-to-HTML converter and LLM-prompt cleaner for HF README content.
|
||||||
|
|
||||||
No external dependencies. Strips YAML frontmatter, ``<Gallery />`` sections,
|
No external dependencies. Strips YAML frontmatter, ``<Gallery />`` sections,
|
||||||
badge images, and HTML comments before rendering. Only used by the
|
badge images, and HTML comments before rendering. Only used by the
|
||||||
``enrich_hf_metadata`` skill.
|
``enrich_hf_metadata`` skill.
|
||||||
|
|
||||||
|
Also provides :func:`clean_readme_for_llm` which pre-processes the raw README
|
||||||
|
before it is injected into the LLM prompt, removing content that has zero value
|
||||||
|
for metadata extraction (language-tagged code blocks, training tables,
|
||||||
|
boilerplate, massive lists, etc.).
|
||||||
"""
|
"""
|
||||||
|
|
||||||
from __future__ import annotations
|
from __future__ import annotations
|
||||||
@@ -118,6 +123,88 @@ def extract_gallery_images(
|
|||||||
return images
|
return images
|
||||||
|
|
||||||
|
|
||||||
|
def extract_gallery_table_images(
    markdown_text: str,
    repo: str,
    existing_urls: set | None = None,
    default_width: int = 512,
    default_height: int = 512,
) -> list[dict]:
    """Extract images from ``| Preview | Prompt |`` markdown gallery tables.

    A table is recognised when a line containing ``|`` is immediately
    followed by a separator row (``|---|---|``) and the first header cell
    contains one of ``preview``, ``sample``, ``gallery``, ``image`` or
    ``thumbnail`` (case-insensitive). For every data row whose first cell
    holds a markdown image, one entry is produced; the second cell is used
    as the prompt.

    Args:
        markdown_text: README text to scan.
        repo: ``owner/name`` repository id used to build absolute URLs for
            relative image paths.
        existing_urls: URLs that were already collected elsewhere; rows
            resolving to one of them are skipped. The set is not mutated.
        default_width: Value stored under ``width`` for every image.
        default_height: Value stored under ``height`` for every image.

    Returns:
        A list of image dicts (``url``, ``type``, ``nsfwLevel``, ``width``,
        ``height``, ``meta``, ``hasMeta``, ``hasPositivePrompt``), one per
        distinct URL, in document order. Empty when either *markdown_text*
        or *repo* is empty.
    """
    if not markdown_text or not repo:
        return []

    base_url = f"https://huggingface.co/{repo}/resolve/main"
    gallery_keywords = ("preview", "sample", "gallery", "image", "thumbnail")
    separator_re = re.compile(r"^\|[\s:-]+\|")
    image_re = re.compile(r"!\[([^\]]*)\]\(([^)]+)\)")

    images: list[dict] = []
    # Copy so the caller's set is left untouched while we dedupe locally.
    seen_urls: set = set(existing_urls) if existing_urls else set()
    lines = markdown_text.split("\n")
    n = len(lines)
    i = 0

    while i < n:
        header = lines[i]
        # A table starts with a pipe-bearing header followed by a separator row.
        if "|" not in header or i + 1 >= n or not separator_re.match(lines[i + 1]):
            i += 1
            continue

        first_cell = header.strip().lower().strip("|").split("|")[0].strip()
        if not any(kw in first_cell for kw in gallery_keywords):
            i += 1
            continue

        # Skip header + separator, then consume contiguous data rows.
        i += 2
        while i < n and "|" in lines[i]:
            cells = [c.strip() for c in lines[i].strip().strip("|").split("|")]
            i += 1
            if len(cells) < 2:
                continue

            match = image_re.search(cells[0])
            if not match:
                continue

            # The link target may carry an optional title: (path "Title").
            # Only the first whitespace-delimited token is the destination.
            target = match.group(2).split()
            if not target:
                continue
            raw_path = target[0]

            if raw_path.lower().startswith(("http://", "https://")):
                url = raw_path
            else:
                relative = _normalise_gallery_path(raw_path)
                if not relative:
                    continue
                url = f"{base_url}/{relative}"

            if url in seen_urls:
                continue
            seen_urls.add(url)

            prompt = cells[1]
            images.append({
                "url": url,
                "type": "image",
                "nsfwLevel": 0,
                "width": default_width,
                "height": default_height,
                "meta": {"prompt": prompt, "negativePrompt": ""},
                "hasMeta": bool(prompt),
                "hasPositivePrompt": bool(prompt),
            })

    return images


def _normalise_gallery_path(path: str) -> str:
    """Drop leading ``./``, ``../`` and ``/`` segments from a relative path.

    Whole prefixes are removed rather than a character set, so names that
    merely start with a dot (``.assets/img.png``) are preserved.
    """
    prefixes = ("./", "../", "/")
    trimmed = True
    while trimmed:
        trimmed = False
        for prefix in prefixes:
            if path.startswith(prefix):
                path = path[len(prefix):]
                trimmed = True
                break
    return path
|
||||||
|
|
||||||
|
|
||||||
def _extract_frontmatter(text: str) -> str:
|
def _extract_frontmatter(text: str) -> str:
|
||||||
"""Return the YAML frontmatter content (without the ``---`` delimiters).
|
"""Return the YAML frontmatter content (without the ``---`` delimiters).
|
||||||
|
|
||||||
@@ -145,7 +232,260 @@ def convert_readme_to_html(markdown_text: str | None) -> str:
|
|||||||
|
|
||||||
|
|
||||||
# ---------------------------------------------------------------------------
|
# ---------------------------------------------------------------------------
|
||||||
# Pre-processing: strip unwanted sections
|
# README cleaning for LLM prompt injection
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
#: Section headers that signal boilerplate content with zero metadata value.
#: ``_strip_boilerplate_sections`` compares these against lower-cased heading
#: titles: a heading matches when its title equals a keyword or starts with
#: the keyword followed by a space or a colon.
_BOILERPLATE_HEADERS: tuple[str, ...] = (
    "download model",
    "license",
    "citation",
    "links",
    "disclaimer",
    "architecture notes",
    "training details",
    "dataset",
    "provenance",
)

#: Table header keywords that identify training-parameter tables.
#: ``_strip_training_tables`` substring-matches these against the lower-cased
#: header row and first data row of each detected table.
_TRAINING_PARAM_KEYWORDS: tuple[str, ...] = (
    "lr scheduler",
    "optimizer",
    "network dim",
    "network alpha",
    "noise offset",
    "multires noise",
    "repeat",
    "epoch",
    "batch size",
    "gradient accumulation",
    "learning rate",
    "rslora",
    "dtype",
)

#: Minimum stripped length (in chars) at which a line is treated as list-like
#: when it starts a candidate run in ``_strip_massive_lists``.
_MASSIVE_LIST_LINE_MIN_LEN = 150
#: Minimum number of non-blank list-like lines a run must contain before
#: ``_strip_massive_lists`` removes it.
_MASSIVE_LIST_THRESHOLD = 8
|
||||||
|
|
||||||
|
|
||||||
|
def clean_readme_for_llm(markdown_text: str | None, max_length: int = 6000) -> str:
    """Clean a HF README for injection into an LLM metadata-extraction prompt.

    The following passes run in order (broader strips first, finer ones
    later):

    * ``_strip_gallery`` — gallery markup
    * ``_strip_fenced_code_blocks`` — fenced code blocks that carry a
      language tag; untagged blocks are kept
    * ``_strip_standalone_images`` — images on their own line (markdown
      embeds are reduced to their alt text, ``<img>`` tags are removed)
    * ``_strip_training_tables`` — training-parameter tables
    * ``_strip_boilerplate_sections`` — Download / License / Citation / …
    * ``_strip_massive_lists`` — long enumeration runs
    * ``_strip_badge_images`` and ``_strip_html_comments``
    * ``_compress_blank_lines``

    No pass called here targets the YAML frontmatter, so any ``widget:``
    text is presumably still present in the result.
    NOTE(review): confirm ``_strip_gallery`` leaves the frontmatter alone.

    Only the returned copy is modified; callers that need the untouched
    README (HTML conversion, gallery-image extraction) must keep the raw
    text themselves.

    Args:
        markdown_text: Raw README.md content from HuggingFace.
        max_length: Hard ceiling on output length (default 6 000 chars).

    Returns:
        Cleaned markdown without surrounding whitespace, at most
        *max_length* characters long. Empty string for empty input.
    """
    if not markdown_text:
        return ""

    text = markdown_text

    # Order matters — broader strips first, then finer ones.
    text = _strip_gallery(text)
    text = _strip_fenced_code_blocks(text)
    text = _strip_standalone_images(text)
    text = _strip_training_tables(text)
    text = _strip_boilerplate_sections(text)
    text = _strip_massive_lists(text)
    text = _strip_badge_images(text)
    text = _strip_html_comments(text)
    text = _compress_blank_lines(text)

    # Trim before truncating so leading whitespace left behind by the strip
    # passes does not eat into the length budget.
    text = text.strip()
    if len(text) > max_length:
        text = text[:max_length].rstrip()

    return text
|
||||||
|
|
||||||
|
|
||||||
|
def _strip_fenced_code_blocks(text: str) -> str:
|
||||||
|
"""Strip fenced code blocks that have an explicit programming-language tag.
|
||||||
|
|
||||||
|
Blocks without a language tag (just `` ``` ``) are preserved — they
|
||||||
|
often contain trigger words, example prompts, or config snippets
|
||||||
|
rather than actual runnable code.
|
||||||
|
"""
|
||||||
|
# Match opening ``` immediately followed by a word character (the language
|
||||||
|
# tag), then any content, then closing ```. Plain ``` at the start of a
|
||||||
|
# line is left intact. A leading \n is optional (handles blocks at the
|
||||||
|
# start of the text).
|
||||||
|
return re.sub(
|
||||||
|
r"(?:\n|^)```[a-zA-Z_][a-zA-Z0-9_]*\s*\n.*?\n```",
|
||||||
|
"",
|
||||||
|
text,
|
||||||
|
flags=re.DOTALL,
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
|
def _strip_standalone_images(text: str) -> str:
|
||||||
|
"""Strip image embeds that occupy their own line.
|
||||||
|
|
||||||
|
Preserves the alt text from markdown images (```` → ``alt``)
|
||||||
|
since it often describes what the model generates, which is useful signal
|
||||||
|
for tag/description extraction.
|
||||||
|
"""
|
||||||
|
# Markdown: ```` on its own line → keep alt text
|
||||||
|
text = re.sub(
|
||||||
|
r"^\s*!\[([^\]]*)\]\([^)]+\)\s*$",
|
||||||
|
r"\1",
|
||||||
|
text,
|
||||||
|
flags=re.MULTILINE,
|
||||||
|
)
|
||||||
|
# HTML: ``<img src="..." ...>`` on its own line → remove entirely
|
||||||
|
text = re.sub(
|
||||||
|
r'^\s*<img\s[^>]+/?>(?:</img>)?\s*$',
|
||||||
|
"",
|
||||||
|
text,
|
||||||
|
flags=re.MULTILINE | re.IGNORECASE,
|
||||||
|
)
|
||||||
|
return text
|
||||||
|
|
||||||
|
|
||||||
|
def _strip_training_tables(text: str) -> str:
|
||||||
|
"""Strip markdown tables whose header row mentions training parameters.
|
||||||
|
|
||||||
|
Checks the header row (first line of a detected table) against
|
||||||
|
``_TRAINING_PARAM_KEYWORDS``. Non-training tables (e.g. "Best
|
||||||
|
Dimensions") are preserved.
|
||||||
|
"""
|
||||||
|
lines = text.split("\n")
|
||||||
|
out: list[str] = []
|
||||||
|
i = 0
|
||||||
|
n = len(lines)
|
||||||
|
|
||||||
|
while i < n:
|
||||||
|
line = lines[i]
|
||||||
|
if "|" in line and i + 1 < n and re.match(r"^\|[\s:-]+\|", lines[i + 1]):
|
||||||
|
table_lines = [line]
|
||||||
|
i += 1
|
||||||
|
while i < n and "|" in lines[i]:
|
||||||
|
table_lines.append(lines[i])
|
||||||
|
i += 1
|
||||||
|
|
||||||
|
# Check header + first data row for training keywords
|
||||||
|
header_and_first = (line + "\n" + (table_lines[2] if len(table_lines) > 2 else "")).lower()
|
||||||
|
if any(kw in header_and_first for kw in _TRAINING_PARAM_KEYWORDS):
|
||||||
|
continue
|
||||||
|
out.extend(table_lines)
|
||||||
|
else:
|
||||||
|
out.append(line)
|
||||||
|
i += 1
|
||||||
|
|
||||||
|
return "\n".join(out)
|
||||||
|
|
||||||
|
|
||||||
|
def _strip_boilerplate_sections(text: str) -> str:
|
||||||
|
"""Strip sections whose headings match known boilerplate patterns.
|
||||||
|
|
||||||
|
When a heading (``## Download model``, ``## License``, etc.) is
|
||||||
|
detected, the heading and all content until the next heading of
|
||||||
|
equal-or-higher level is removed.
|
||||||
|
"""
|
||||||
|
lines = text.split("\n")
|
||||||
|
out: list[str] = []
|
||||||
|
i = 0
|
||||||
|
n = len(lines)
|
||||||
|
skip_until_level: int | None = None
|
||||||
|
|
||||||
|
while i < n:
|
||||||
|
line = lines[i]
|
||||||
|
h_match = re.match(r"^(#{1,4})\s+(.+?)\s*#*$", line)
|
||||||
|
if h_match:
|
||||||
|
level = len(h_match.group(1))
|
||||||
|
title = h_match.group(2).strip().lower()
|
||||||
|
|
||||||
|
is_boilerplate = any(
|
||||||
|
title == kw or title.startswith(kw + " ") or title.startswith(kw + ":")
|
||||||
|
for kw in _BOILERPLATE_HEADERS
|
||||||
|
)
|
||||||
|
|
||||||
|
if is_boilerplate:
|
||||||
|
skip_until_level = level
|
||||||
|
i += 1
|
||||||
|
continue
|
||||||
|
|
||||||
|
if skip_until_level is not None and level <= skip_until_level:
|
||||||
|
skip_until_level = None
|
||||||
|
|
||||||
|
if skip_until_level is None:
|
||||||
|
out.append(line)
|
||||||
|
i += 1
|
||||||
|
|
||||||
|
return "\n".join(out)
|
||||||
|
|
||||||
|
|
||||||
|
def _strip_massive_lists(text: str) -> str:
|
||||||
|
"""Strip blocks of 8+ consecutive enumeration-style lines.
|
||||||
|
|
||||||
|
Targets long comma-separated name lists (e.g. the 3000+ celebrity
|
||||||
|
names in some Z-Image READMEs) and dense bullet enumerations.
|
||||||
|
"""
|
||||||
|
lines = text.split("\n")
|
||||||
|
out: list[str] = []
|
||||||
|
i = 0
|
||||||
|
n = len(lines)
|
||||||
|
|
||||||
|
while i < n:
|
||||||
|
stripped = lines[i].strip()
|
||||||
|
|
||||||
|
# A "list-like" line ends with comma or is a bullet with commas
|
||||||
|
is_list_like = bool(stripped) and (
|
||||||
|
stripped.endswith(",")
|
||||||
|
or len(stripped) >= _MASSIVE_LIST_LINE_MIN_LEN
|
||||||
|
or (bool(re.match(r"^[-*+]\s", stripped)) and "," in stripped)
|
||||||
|
)
|
||||||
|
|
||||||
|
if is_list_like:
|
||||||
|
count = 1
|
||||||
|
j = i + 1
|
||||||
|
while j < n:
|
||||||
|
s = lines[j].strip()
|
||||||
|
if not s:
|
||||||
|
j += 1
|
||||||
|
continue
|
||||||
|
if s.endswith(",") or (bool(re.match(r"^[-*+]\s", s)) and "," in s):
|
||||||
|
count += 1
|
||||||
|
j += 1
|
||||||
|
else:
|
||||||
|
break
|
||||||
|
|
||||||
|
if count >= _MASSIVE_LIST_THRESHOLD:
|
||||||
|
i = j
|
||||||
|
continue
|
||||||
|
|
||||||
|
out.append(lines[i])
|
||||||
|
i += 1
|
||||||
|
|
||||||
|
return "\n".join(out)
|
||||||
|
|
||||||
|
|
||||||
|
def _compress_blank_lines(text: str) -> str:
|
||||||
|
"""Collapse runs of 3+ blank lines down to 2."""
|
||||||
|
return re.sub(r"\n{3,}", "\n\n", text)
|
||||||
|
|
||||||
|
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
# Pre-processing: strip unwanted sections (HTML conversion helpers)
|
||||||
# ---------------------------------------------------------------------------
|
# ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
|
||||||
|
|||||||
@@ -333,18 +333,53 @@ class LLMService:
|
|||||||
|
|
||||||
cfg = self._ensure_configured()
|
cfg = self._ensure_configured()
|
||||||
api_base = self._resolve_api_base(cfg["provider"], cfg["api_base"])
|
api_base = self._resolve_api_base(cfg["provider"], cfg["api_base"])
|
||||||
url = f"{api_base}/chat/completions"
|
|
||||||
model_name = model or cfg["model"]
|
model_name = model or cfg["model"]
|
||||||
|
|
||||||
payload: Dict[str, Any] = {
|
is_ollama = cfg["provider"] == "ollama"
|
||||||
"model": model_name,
|
|
||||||
"messages": messages,
|
if is_ollama:
|
||||||
"temperature": temperature,
|
# Use Ollama's native /api/chat endpoint which does NOT expose
|
||||||
}
|
# a separate reasoning/thinking field (the model's full output
|
||||||
if response_format is not None:
|
# lands directly in message.content). The OpenAI-compatible
|
||||||
payload["response_format"] = response_format
|
# endpoint splits thinking into the "reasoning" field, making
|
||||||
if max_tokens is not None:
|
# content empty when thinking consumes all available tokens.
|
||||||
payload["max_tokens"] = max_tokens
|
base = api_base.rstrip("/")
|
||||||
|
if base.endswith("/v1"):
|
||||||
|
base = base[:-3]
|
||||||
|
url = f"{base}/api/chat"
|
||||||
|
else:
|
||||||
|
url = f"{api_base}/chat/completions"
|
||||||
|
|
||||||
|
payload: Dict[str, Any]
|
||||||
|
if is_ollama:
|
||||||
|
payload = {
|
||||||
|
"model": model_name,
|
||||||
|
"messages": messages,
|
||||||
|
"stream": False,
|
||||||
|
# Suppress separate thinking trace — thinking still happens
|
||||||
|
# internally (accuracy preserved) but output goes directly to
|
||||||
|
# message.content instead of being split across content +
|
||||||
|
# thinking. Without this the model can exhaust num_predict
|
||||||
|
# on thinking alone and leave content empty.
|
||||||
|
"think": False,
|
||||||
|
"options": {
|
||||||
|
"temperature": temperature,
|
||||||
|
},
|
||||||
|
}
|
||||||
|
if response_format is not None:
|
||||||
|
payload["format"] = "json"
|
||||||
|
if max_tokens is not None:
|
||||||
|
payload["options"]["num_predict"] = max_tokens
|
||||||
|
else:
|
||||||
|
payload = {
|
||||||
|
"model": model_name,
|
||||||
|
"messages": messages,
|
||||||
|
"temperature": temperature,
|
||||||
|
}
|
||||||
|
if response_format is not None:
|
||||||
|
payload["response_format"] = response_format
|
||||||
|
if max_tokens is not None:
|
||||||
|
payload["max_tokens"] = max_tokens
|
||||||
|
|
||||||
headers = self._build_headers(cfg["api_key"])
|
headers = self._build_headers(cfg["api_key"])
|
||||||
|
|
||||||
@@ -387,8 +422,25 @@ class LLMService:
|
|||||||
|
|
||||||
# Parse response
|
# Parse response
|
||||||
try:
|
try:
|
||||||
content = data["choices"][0]["message"]["content"]
|
if is_ollama:
|
||||||
usage = data.get("usage", {})
|
content = (data.get("message") or {}).get("content") or ""
|
||||||
|
usage = {"completion_tokens": data.get("eval_count", 0)}
|
||||||
|
finish_reason = data.get("done_reason", "")
|
||||||
|
if not content:
|
||||||
|
logger.warning(
|
||||||
|
"LLM returned empty content. Provider=ollama, "
|
||||||
|
"done_reason=%s, eval_count=%s",
|
||||||
|
finish_reason,
|
||||||
|
data.get("eval_count", 0),
|
||||||
|
)
|
||||||
|
else:
|
||||||
|
content = data["choices"][0]["message"].get("content") or ""
|
||||||
|
usage = data.get("usage", {})
|
||||||
|
if not content:
|
||||||
|
logger.warning(
|
||||||
|
"LLM returned empty content. Full response truncated: %s",
|
||||||
|
json.dumps(data, ensure_ascii=False)[:1000],
|
||||||
|
)
|
||||||
return {
|
return {
|
||||||
"content": content,
|
"content": content,
|
||||||
"usage": usage,
|
"usage": usage,
|
||||||
@@ -442,13 +494,16 @@ class LLMService:
|
|||||||
{"role": "user", "content": user_prompt},
|
{"role": "user", "content": user_prompt},
|
||||||
]
|
]
|
||||||
|
|
||||||
# First attempt with JSON mode
|
# First attempt with JSON mode.
|
||||||
|
# Use a generous max_tokens so thinking-enabled models (e.g.
|
||||||
|
# gemma4 via Ollama) have room to reason AND still emit content.
|
||||||
|
effective_max = max_tokens or 131072
|
||||||
result = await self.chat_completion(
|
result = await self.chat_completion(
|
||||||
messages=messages,
|
messages=messages,
|
||||||
model=model,
|
model=model,
|
||||||
temperature=temperature,
|
temperature=temperature,
|
||||||
response_format={"type": "json_object"},
|
response_format={"type": "json_object"},
|
||||||
max_tokens=max_tokens,
|
max_tokens=effective_max,
|
||||||
)
|
)
|
||||||
|
|
||||||
try:
|
try:
|
||||||
@@ -458,11 +513,15 @@ class LLMService:
|
|||||||
"LLM JSON parse failed on first attempt: %s. Retrying.", exc
|
"LLM JSON parse failed on first attempt: %s. Retrying.", exc
|
||||||
)
|
)
|
||||||
|
|
||||||
# Retry with explicit instruction to return valid JSON
|
# Retry WITHOUT response_format — some providers (Ollama with
|
||||||
|
# thinking-enabled models like gemma4) may return empty content
|
||||||
|
# when json_object mode is active. Fall back to a textual
|
||||||
|
# instruction instead.
|
||||||
|
previous_content = result.get("content", "") or ""
|
||||||
retry_messages = messages + [
|
retry_messages = messages + [
|
||||||
{
|
{
|
||||||
"role": "assistant",
|
"role": "assistant",
|
||||||
"content": result["content"],
|
"content": previous_content or "(empty response)",
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
"role": "user",
|
"role": "user",
|
||||||
@@ -478,14 +537,21 @@ class LLMService:
|
|||||||
messages=retry_messages,
|
messages=retry_messages,
|
||||||
model=model,
|
model=model,
|
||||||
temperature=0.0, # More deterministic for retry
|
temperature=0.0, # More deterministic for retry
|
||||||
response_format={"type": "json_object"},
|
max_tokens=effective_max,
|
||||||
max_tokens=max_tokens,
|
|
||||||
)
|
)
|
||||||
|
|
||||||
try:
|
content = result.get("content", "") or ""
|
||||||
return json.loads(result["content"])
|
if not content:
|
||||||
except (json.JSONDecodeError, TypeError) as exc:
|
|
||||||
raise LLMResponseError(
|
raise LLMResponseError(
|
||||||
f"LLM response could not be parsed as JSON after retry: {exc}\n"
|
"LLM response could not be parsed as JSON after retry: "
|
||||||
f"Raw content: {result['content'][:500]}"
|
f"Expecting value: line 1 column 1 (char 0)\n"
|
||||||
) from exc
|
f"Raw content: {content[:500]}"
|
||||||
|
)
|
||||||
|
|
||||||
|
try:
|
||||||
|
return json.loads(content)
|
||||||
|
except (json.JSONDecodeError, TypeError) as parse_err:
|
||||||
|
raise LLMResponseError(
|
||||||
|
f"LLM response could not be parsed as JSON after retry: {parse_err}\n"
|
||||||
|
f"Raw content: {content[:500]}"
|
||||||
|
) from parse_err
|
||||||
|
|||||||
@@ -583,3 +583,443 @@ widget:
|
|||||||
assert len(images) == 1
|
assert len(images) == 1
|
||||||
assert "two samurais doing a muay thai fight" in images[0]["meta"]["prompt"]
|
assert "two samurais doing a muay thai fight" in images[0]["meta"]["prompt"]
|
||||||
assert "Textured abstract style" in images[0]["meta"]["prompt"]
|
assert "Textured abstract style" in images[0]["meta"]["prompt"]
|
||||||
|
|
||||||
|
|
||||||
|
# ======================================================================
|
||||||
|
# extract_gallery_table_images — Sample Gallery markdown tables
|
||||||
|
# ======================================================================
|
||||||
|
|
||||||
|
|
||||||
|
class TestExtractGalleryTableImages:
    """Tests for ``extract_gallery_table_images`` on "Sample Gallery" tables.

    Each test feeds a markdown snippet to the helper and inspects the returned
    list of image dicts (``url`` plus ``meta["prompt"]``).
    """

    _REPO = "Limbicnation/pixel-art-lora"

    # A two-row gallery table with repo-relative image paths.
    # NOTE(review): the image cells are reconstructed from what the assertions
    # below require (``samples/knight.png`` / ``samples/dragon.png``); the alt
    # text is a guess and is not asserted on.
    _README = """## Sample Gallery

| Preview | Prompt |
|---------|--------|
| ![knight](samples/knight.png) | pixel art sprite, a brave knight |
| ![dragon](samples/dragon.png) | pixel art sprite, a fire dragon |
"""

    @staticmethod
    def _extract(md: str, repo: str = _REPO, existing: set | None = None):
        """Call the helper under test, importing it lazily.

        Args:
            md: Markdown text to scan.
            repo: Repository id passed through to the helper.
            existing: URLs forwarded as ``existing_urls`` (already-known
                images the helper is expected to skip).
        """
        from py.services.agent.skills.enrich_hf_metadata.md_to_html import \
            extract_gallery_table_images
        return extract_gallery_table_images(md, repo, existing_urls=existing)

    def test_extracts_table_images(self):
        """Both table rows yield an image, in document order."""
        images = self._extract(self._README)
        assert len(images) == 2
        assert "knight.png" in images[0]["url"]
        assert images[0]["meta"]["prompt"] == "pixel art sprite, a brave knight"
        assert "dragon.png" in images[1]["url"]
        assert images[1]["meta"]["prompt"] == "pixel art sprite, a fire dragon"

    def test_skips_existing_urls(self):
        """A row whose resolved URL is already known is not returned again."""
        existing = {"https://huggingface.co/Limbicnation/pixel-art-lora/resolve/main/samples/knight.png"}
        images = self._extract(self._README, existing=existing)
        assert len(images) == 1
        assert "knight.png" not in images[0]["url"]
        assert "dragon.png" in images[0]["url"]

    def test_empty_readme_returns_empty(self):
        """Empty input produces an empty list rather than an error."""
        assert self._extract("") == []

    def test_no_gallery_table_returns_empty(self):
        """Markdown without any table yields nothing."""
        md = "## Description\nSome text."
        assert self._extract(md) == []

    def test_non_gallery_table_skipped(self):
        """A table that contains no image cells is ignored."""
        md = "| Param | Value |\n|---|---|\n| Steps | 4 |"
        assert self._extract(md) == []

    def test_absolute_url_preserved(self):
        """An absolute image URL is returned unchanged (no repo prefixing)."""
        md = (
            "| Preview | Prompt |\n|---|---|\n"
            "| ![img](https://cdn.example.com/img.png) | text |"
        )
        images = self._extract(md, repo="user/repo")
        assert len(images) == 1
        assert images[0]["url"] == "https://cdn.example.com/img.png"
|
||||||
|
|
||||||
|
|
||||||
|
# ======================================================================
|
||||||
|
# clean_readme_for_llm — pre-process README before LLM injection
|
||||||
|
# ======================================================================
|
||||||
|
|
||||||
|
|
||||||
|
class TestCleanReadmeForLlm:
    """Tests for ``clean_readme_for_llm``, the README pre-processor.

    The tests pin which parts of a model README are expected to survive
    cleaning (front-matter keys, widget prompt text, unmarked code fences,
    image alt text, ordinary tables) and which are expected to be removed
    (``<Gallery />``, language-tagged code fences, image URLs, training
    parameter tables, boilerplate sections, very long name lists).

    NOTE(review): nested YAML indentation and markdown image links inside the
    fixtures were reconstructed so that they are well-formed and so that the
    assertions are satisfiable; confirm against the upstream test file.
    """

    @staticmethod
    def _clean(md: str, max_length: int = 6000) -> str:
        """Call the helper under test, importing it lazily."""
        from py.services.agent.skills.enrich_hf_metadata.md_to_html import \
            clean_readme_for_llm
        return clean_readme_for_llm(md, max_length=max_length)

    # -- basic guards --------------------------------------------------------

    def test_none_returns_empty(self):
        """``None`` is tolerated and mapped to an empty string."""
        assert self._clean(None) == ""  # type: ignore[arg-type]

    def test_empty_returns_empty(self):
        """Empty input stays empty."""
        assert self._clean("") == ""

    def test_plain_text_passes_through(self):
        """Text with nothing to strip is kept."""
        result = self._clean("Just some description text.")
        assert "Just some description text." in result

    # -- widget section handling (prompt text is preserved) -----------------

    def test_widget_text_preserved_in_cleaned_output(self):
        """Widget section text is preserved — it provides useful signal
        for tag and description extraction (example prompts describe what
        the model generates)."""
        md = """---
tags:
- lora
- anime
widget:
- text: "a test prompt"
  output:
    url: images/test.png
- text: >-
    another long
    prompt here
  output:
    url: images/test2.png
base_model: black-forest-labs/FLUX.1-dev
instance_prompt: trigger word
---
# Model Description
This is the actual content.
"""
        result = self._clean(md)
        # Widget text content preserved (valuable signal for tags)
        # YAML folded scalars (``>-``) may split text across lines, so the
        # two halves are checked separately.
        assert "a test prompt" in result
        assert "another long" in result
        assert "prompt here" in result
        # Non-widget frontmatter preserved
        assert "base_model: black-forest-labs/FLUX.1-dev" in result
        assert "instance_prompt: trigger word" in result
        assert "tags:" in result
        assert "- lora" in result
        assert "- anime" in result
        assert "Model Description" in result

    def test_widget_last_key_in_frontmatter(self):
        """Widget text at end of frontmatter is preserved."""
        md = """---
tags:
- lora
widget:
- output:
    url: img.png
  text: prompt
---
# Content
"""
        result = self._clean(md)
        assert "prompt" in result
        assert "tags:" in result

    def test_no_widget_untouched(self):
        """Front matter without a widget key keeps its other keys."""
        md = """---
tags:
- lora
base_model: flux
---
# Content
"""
        result = self._clean(md)
        assert "tags:" in result
        assert "base_model: flux" in result

    # -- gallery stripping ---------------------------------------------------

    def test_gallery_tag_stripped(self):
        """The ``<Gallery />`` component tag is removed."""
        md = "Some text\n<Gallery />\nmore text"
        result = self._clean(md)
        assert "<Gallery" not in result

    # -- code block stripping ------------------------------------------------

    def test_fenced_code_block_stripped(self):
        """A ``python``-tagged fence is removed; surrounding headings stay."""
        md = """## Usage
```python
import torch
pipe = DiffusionPipeline.from_pretrained('base')
```
## Description
Some text.
"""
        result = self._clean(md)
        assert "import torch" not in result
        assert "DiffusionPipeline" not in result
        assert "## Usage" in result
        assert "## Description" in result

    def test_bash_code_block_stripped(self):
        """A ``bash``-tagged fence is removed; its heading stays."""
        md = """## Setup
```bash
pip install diffusers
huggingface-cli download repo
```
"""
        result = self._clean(md)
        assert "pip install" not in result
        assert "## Setup" in result

    def test_code_block_sections_remain_separated(self):
        """Removing a fence must not swallow the section that follows it."""
        md = "## Install\n```bash\npip install x\n```\n\n## Usage\nSome text."
        result = self._clean(md)
        assert "pip install" not in result
        assert "## Install" in result
        assert "## Usage" in result
        assert "Some text." in result

    def test_unmarked_code_block_preserved(self):
        """Unmarked fenced code blocks (just ```) are kept since they
        often contain trigger words rather than code."""
        md = """### Trigger Words

Always include:

```
pixel art sprite, game asset, transparent background
```
"""
        result = self._clean(md)
        assert "pixel art sprite" in result
        assert "game asset" in result
        assert "transparent background" in result

    def test_unmarked_code_block_with_python_preserved(self):
        """Even unmarked blocks with Python code are kept (false positive
        accepted because trigger-word blocks are unmarked)."""
        md = "## Setup\n```\nimport torch\nprint('hello')\n```\n## Desc\nText."
        result = self._clean(md)
        assert "import torch" in result

    # -- standalone image stripping ------------------------------------------

    def test_standalone_image_stripped(self):
        """Standalone images lose their URL but keep their alt text."""
        # NOTE(review): image links reconstructed; only the host
        # (``cdn.hf.co``) and the alt texts are implied by the assertions.
        md = (
            "## Gallery\n"
            "![sample](https://cdn.hf.co/a.png)\n"
            "![another](https://cdn.hf.co/b.png)\n"
            "\n"
            "Some text."
        )
        result = self._clean(md)
        assert "cdn.hf.co" not in result
        assert "sample" in result  # alt text preserved
        assert "another" in result  # alt text preserved
        assert "## Gallery" in result
        assert "Some text." in result

    def test_html_img_tag_stripped(self):
        """An HTML ``<img>`` element is removed together with its URL."""
        md = '## Preview\n<img src="https://cdn.hf.co/img.webp"></img>\n\nDescription.'
        result = self._clean(md)
        assert "cdn.hf.co" not in result
        assert "Description." in result

    def test_inline_image_within_paragraph_preserved(self):
        """Inline images inside paragraphs are rare but shouldn't be stripped."""
        # NOTE(review): the inline image link is reconstructed; the
        # assertions only cover the text on either side of it.
        md = "Click here ![icon](https://cdn.hf.co/icon.png) for more info."
        result = self._clean(md)
        assert "Click here" in result
        assert "for more info" in result

    # -- training table stripping --------------------------------------------

    def test_training_table_stripped(self):
        """A training-parameter table is removed; an ordinary table is kept."""
        md = """## Training
| Parameter | Value |
|---------------|----------|
| LR Scheduler | constant |
| Optimizer | AdamW |
| Network Dim | 64 |
## Best Dimensions
| Resolution | Status |
|-----------|---------|
| 768x1024 | Best |
"""
        result = self._clean(md)
        assert "LR Scheduler" not in result
        assert "Optimizer" not in result
        assert "Network Dim" not in result
        # Normal table preserved
        assert "Best Dimensions" in result
        assert "768x1024" in result

    def test_normal_table_preserved(self):
        """A table without training parameters passes through."""
        md = """## Recommended
| Resolution | Status |
|-----------|---------|
| 1024x1024 | Default |
"""
        result = self._clean(md)
        assert "1024x1024" in result

    # -- boilerplate section stripping ---------------------------------------

    def test_boilerplate_license_stripped(self):
        """A ``## License`` section is dropped up to the next heading."""
        md = """## Description
Some text.
## License
apache-2.0
Some license details here.
## More Content
After license.
"""
        result = self._clean(md)
        assert "apache-2.0" not in result
        assert "## License" not in result
        assert "## Description" in result
        assert "## More Content" in result
        assert "After license." in result

    def test_boilerplate_disclaimer_stripped(self):
        """Disclaimer and citation sections are dropped (heading case varies)."""
        md = """## Description
Some text.
## DISCLAIMER
Legal text here.
## Citation
Bibtex here.
"""
        result = self._clean(md)
        assert "Legal text" not in result
        assert "Bibtex" not in result
        assert "Some text." in result

    def test_boilerplate_subsection_not_stripped(self):
        """Only top-level (##) boilerplate is stripped; ### subsections inside
        non-boilerplate headings are left alone."""
        md = """## Usage
Some text.
### Important Note
This is a note within the usage section.
"""
        result = self._clean(md)
        assert "Important Note" in result

    # -- massive list stripping ----------------------------------------------

    def test_massive_name_list_stripped(self):
        """Twelve consecutive comma-separated name lines are removed."""
        lines = ["## 2026 Updates:"]
        lines.extend(
            f"Name{i}A, Name{i}B, Name{i}C, Name{i}D, Name{i}E,"
            for i in range(12)
        )
        lines.append("## License")
        lines.append("apache")
        md = "\n".join(lines)
        result = self._clean(md)
        assert "Name0A" not in result
        assert "Name11E" not in result
        assert "## 2026 Updates:" in result
        # License stripped by boilerplate
        assert "apache" not in result

    def test_short_list_preserved(self):
        """Short lists (< 8 consecutive lines) should not be stripped."""
        lines = ["## Tags:"]
        lines.extend(f"tag{i}A, tag{i}B," for i in range(4))
        lines.append("## Description")
        lines.append("Some text.")
        md = "\n".join(lines)
        result = self._clean(md)
        assert "tag0A" in result
        assert "tag3B" in result

    # -- max_length truncation -----------------------------------------------

    def test_truncation(self):
        """Output never exceeds ``max_length`` and keeps the leading text."""
        md = "A" * 100 + "\n" + "B" * 100
        result = self._clean(md, max_length=150)
        assert len(result) <= 150
        assert result.startswith("A" * 100)

    # -- integration: end-to-end realistic README ----------------------------

    def test_realistic_flux_lora_readme(self):
        """End-to-end check on a README shaped like a real FLUX LoRA card."""
        md = """---
tags:
- text-to-image
- lora
- diffusers
- 3D
- Toon
widget:
- text: >-
    Long toons, a close-up of a cartoon character face...
  output:
    url: images/LT4.png
- text: >-
    Long toons, Super Detail, a close-up shot...
  output:
    url: images/LT5.png
base_model: black-forest-labs/FLUX.1-dev
instance_prompt: Long toons
license: creativeml-openrail-m
---
# Flux-Long-Toon-LoRA

<Gallery />

**The model is still in the training phase.**

## Model description

**prithivMLmods/Flux-Long-Toon-LoRA**

Image Processing Parameters

| Parameter | Value | Parameter | Value |
|---------------------------|--------|---------------------------|--------|
| LR Scheduler | constant | Noise Offset | 0.03 |
| Optimizer | AdamW | Multires Noise Discount | 0.1 |
| Network Dim | 64 | Multires Noise Iterations | 10 |
| Network Alpha | 32 | Repeat & Steps | 25 & 3270 |
| Epoch | 18 | Save Every N Epochs | 1 |

## Best Dimensions

- 768 x 1024 (Best)
- 1024 x 1024 (Default)

## Setting Up
```python
import torch
from pipelines import DiffusionPipeline

base_model = "black-forest-labs/FLUX.1-dev"
pipe = DiffusionPipeline.from_pretrained(base_model, torch_dtype=torch.bfloat16)

lora_repo = "prithivMLmods/Flux-Long-Toon-LoRA"
trigger_word = "Long toons"
pipe.load_lora_weights(lora_repo)
```

## Trigger words

You should use `Long toons` to trigger the image generation.

## Download model

Weights for this model are available in Safetensors format.
"""
        original_len = len(md)
        result = self._clean(md)

        # Still significantly smaller (widget text is kept but training
        # tables, code blocks, boilerplate are stripped)
        assert len(result) < original_len * 0.7, (
            f"Expected <70% of original, got {len(result)}/{original_len}"
        )

        # Signal preserved
        assert "Long toons" in result
        assert "black-forest-labs/FLUX.1-dev" in result
        assert "3D" in result
        assert "Toon" in result

        # Widget content preserved (text is valuable signal for tags/desc)
        assert "close-up of a cartoon character face" in result
        assert "Super Detail" in result

        # Noise stripped
        assert "import torch" not in result
        assert "DiffusionPipeline" not in result
        assert "LR Scheduler" not in result
        assert "<Gallery" not in result
        assert "Download model" not in result
|
||||||
|
|||||||
Reference in New Issue
Block a user