mirror of
https://github.com/willmiao/ComfyUI-Lora-Manager.git
synced 2026-07-05 17:01:16 -03:00
refactor(agent): rename md_to_html to readme_processor, fix section extraction, widget parsing, and list_base_models
- Rename md_to_html.py → readme_processor.py (the file no longer does just HTML conversion)
- _extract_section: include YAML frontmatter, use a heading-level-aware forward walk (sub-headings under # are included), increase the walk limit past 30 lines
- _is_heading: exclude </hN> closing tags from boundary detection
- _heading_level: new helper for heading-level-aware section matching
- css: yield 0 for heading-like closing tags, which were unexpectedly caught by _is_heading
- extract_gallery_images: fix YAML block scalar (text: >-) prompt extraction; use endswith instead of == to detect the block marker
- _strip_widget_section: add to clean_readme_for_llm (widget text is handled by the post-processor and is not needed in the LLM prompt)
- _strip_standalone_images: keep markdown image URLs intact for LLM preview extraction (was stripping to alt text only)
- list_base_models: switch from scanner-cache aggregation to CivitaiBaseModelService.get_base_models() — always returns the full list
- Ollama: add num_ctx=32768 to payload options so thinking models have room to both reason and produce output
- Add tests/agent_cli/test_readme_processor.py: 59 tests covering extraction, cleaning, section matching, and heading detection
- Update existing tests for behavioral changes
This commit is contained in:
@@ -113,38 +113,29 @@ async def identify_model_type(model_path: str) -> str:
|
|||||||
|
|
||||||
|
|
||||||
async def list_base_models(limit: int = 0) -> List[str]:
|
async def list_base_models(limit: int = 0) -> List[str]:
|
||||||
"""Return deduplicated base model names from all model caches.
|
"""Return all valid CivitAI base model names.
|
||||||
|
|
||||||
The result is ordered by frequency (most common first). Pass
|
Uses ``CivitaiBaseModelService.get_base_models()`` which merges a
|
||||||
*limit* = 0 (default) for all models.
|
hardcoded list (``SUPPORTED_DOWNLOAD_SKIP_BASE_MODELS``) with remote
|
||||||
|
models fetched from the CivitAI API. Never empty — the hardcoded
|
||||||
|
fallback always provides a complete set.
|
||||||
|
|
||||||
|
The result is sorted alphabetically. Pass *limit* = 0 for all models.
|
||||||
"""
|
"""
|
||||||
from ..services.service_registry import ServiceRegistry
|
from ..services.civitai_base_model_service import (
|
||||||
|
CivitaiBaseModelService,
|
||||||
|
)
|
||||||
|
|
||||||
counts: Dict[str, int] = {}
|
try:
|
||||||
for getter_name in (
|
service = await CivitaiBaseModelService.get_instance()
|
||||||
"get_lora_scanner",
|
response = await service.get_base_models()
|
||||||
"get_checkpoint_scanner",
|
names: List[str] = response.get("models", [])
|
||||||
"get_embedding_scanner",
|
except Exception as exc:
|
||||||
):
|
logger.warning("list_base_models failed: %s", exc)
|
||||||
getter = getattr(ServiceRegistry, getter_name, None)
|
names = []
|
||||||
if getter is None:
|
|
||||||
continue
|
|
||||||
try:
|
|
||||||
scanner = await getter()
|
|
||||||
if scanner is None:
|
|
||||||
continue
|
|
||||||
cache = await scanner.get_cached_data()
|
|
||||||
for entry in cache.raw_data:
|
|
||||||
bm = entry.get("base_model")
|
|
||||||
if bm:
|
|
||||||
counts[bm] = counts.get(bm, 0) + 1
|
|
||||||
except Exception as exc:
|
|
||||||
logger.debug("list_base_models scanner %s error: %s", getter_name, exc)
|
|
||||||
|
|
||||||
sorted_names = [name for name, _ in sorted(counts.items(), key=lambda x: -x[1])]
|
|
||||||
if limit > 0:
|
if limit > 0:
|
||||||
return sorted_names[:limit]
|
return names[:limit]
|
||||||
return sorted_names
|
return names
|
||||||
|
|
||||||
|
|
||||||
async def read_metadata(model_path: str) -> Dict[str, Any]:
|
async def read_metadata(model_path: str) -> Dict[str, Any]:
|
||||||
|
|||||||
@@ -31,7 +31,7 @@ from ..llm_service import LLMService
|
|||||||
from ..websocket_manager import ws_manager
|
from ..websocket_manager import ws_manager
|
||||||
from .post_processor import PostProcessor
|
from .post_processor import PostProcessor
|
||||||
from .skill_registry import SkillRegistry
|
from .skill_registry import SkillRegistry
|
||||||
from .skills.enrich_hf_metadata.md_to_html import (
|
from .skills.enrich_hf_metadata.readme_processor import (
|
||||||
clean_readme_for_llm,
|
clean_readme_for_llm,
|
||||||
extract_relevant_section,
|
extract_relevant_section,
|
||||||
)
|
)
|
||||||
@@ -397,6 +397,10 @@ class AgentService:
|
|||||||
cleaned = clean_readme_for_llm(readme) if readme else ""
|
cleaned = clean_readme_for_llm(readme) if readme else ""
|
||||||
context["readme_content"] = cleaned if cleaned else "(README not available)"
|
context["readme_content"] = cleaned if cleaned else "(README not available)"
|
||||||
context["readme_content_full"] = readme or ""
|
context["readme_content_full"] = readme or ""
|
||||||
|
logger.info(
|
||||||
|
"Cleaned README for %s (%d chars): ---BEGIN---\n%s\n---END---",
|
||||||
|
repo, len(cleaned), cleaned[:800] if cleaned else "(empty)",
|
||||||
|
)
|
||||||
|
|
||||||
try:
|
try:
|
||||||
context["base_models"] = await list_base_models()
|
context["base_models"] = await list_base_models()
|
||||||
|
|||||||
@@ -78,7 +78,7 @@ class PostProcessor:
|
|||||||
download_preview,
|
download_preview,
|
||||||
refresh_cache,
|
refresh_cache,
|
||||||
)
|
)
|
||||||
from .skills.enrich_hf_metadata.md_to_html import (
|
from .skills.enrich_hf_metadata.readme_processor import (
|
||||||
convert_readme_to_html,
|
convert_readme_to_html,
|
||||||
extract_gallery_images,
|
extract_gallery_images,
|
||||||
extract_gallery_table_images,
|
extract_gallery_table_images,
|
||||||
|
|||||||
@@ -1,13 +1,8 @@
|
|||||||
"""Inline markdown-to-HTML converter and LLM-prompt cleaner for HF README content.
|
"""HF README processing for the ``enrich_hf_metadata`` skill.
|
||||||
|
|
||||||
No external dependencies. Strips YAML frontmatter, ``<Gallery />`` sections,
|
Provides README cleaning for LLM injection, gallery/image extraction from
|
||||||
badge images, and HTML comments before rendering. Used by the
|
multiple formats (YAML widget, markdown, HTML ``<img>``, gallery tables),
|
||||||
``enrich_hf_metadata`` feature.
|
and section-based README trimming for collection repos.
|
||||||
|
|
||||||
Also provides :func:`clean_readme_for_llm` which pre-processes the raw README
|
|
||||||
before it is injected into the LLM prompt, removing content that has zero value
|
|
||||||
for metadata extraction (widget sections, code blocks, training tables,
|
|
||||||
boilerplate, massive lists, etc.).
|
|
||||||
"""
|
"""
|
||||||
|
|
||||||
from __future__ import annotations
|
from __future__ import annotations
|
||||||
@@ -241,7 +236,26 @@ def extract_gallery_images(
|
|||||||
if text_match:
|
if text_match:
|
||||||
raw_text = text_match.group(1).strip().strip("'\"")
|
raw_text = text_match.group(1).strip().strip("'\"")
|
||||||
if raw_text and raw_text != "-":
|
if raw_text and raw_text != "-":
|
||||||
text = raw_text
|
# Handle YAML block scalar markers (>-, >, |, |-) where the
|
||||||
|
# actual text lives on subsequent indented lines.
|
||||||
|
if raw_text in (">", ">-", "|", "|-"):
|
||||||
|
text_lines: list[str] = []
|
||||||
|
in_block = False
|
||||||
|
for line in entry.split("\n"):
|
||||||
|
stripped = line.strip()
|
||||||
|
if not in_block:
|
||||||
|
if stripped.endswith(raw_text):
|
||||||
|
in_block = True
|
||||||
|
continue
|
||||||
|
# Block content ends at a line with less indentation
|
||||||
|
# or a YAML key at the start of a line.
|
||||||
|
if not stripped or re.match(r"^\s*\w+:", line):
|
||||||
|
break
|
||||||
|
if stripped:
|
||||||
|
text_lines.append(stripped)
|
||||||
|
text = " ".join(text_lines)
|
||||||
|
else:
|
||||||
|
text = raw_text
|
||||||
|
|
||||||
if url:
|
if url:
|
||||||
image: dict = {
|
image: dict = {
|
||||||
@@ -439,6 +453,7 @@ def clean_readme_for_llm(markdown_text: str | None, max_length: int = 6000) -> s
|
|||||||
|
|
||||||
# Order matters — broader strips first, then finer ones.
|
# Order matters — broader strips first, then finer ones.
|
||||||
text = _strip_gallery(text)
|
text = _strip_gallery(text)
|
||||||
|
text = _strip_widget_section(text)
|
||||||
text = _strip_fenced_code_blocks(text)
|
text = _strip_fenced_code_blocks(text)
|
||||||
text = _strip_standalone_images(text)
|
text = _strip_standalone_images(text)
|
||||||
text = _strip_training_tables(text)
|
text = _strip_training_tables(text)
|
||||||
@@ -722,6 +737,18 @@ def _looks_like_download_link(line: str) -> bool:
|
|||||||
return False
|
return False
|
||||||
|
|
||||||
|
|
||||||
|
def _heading_level(line: str) -> int:
|
||||||
|
"""Return the heading level of *line* (1-4), or 0 if not a heading."""
|
||||||
|
stripped = line.strip()
|
||||||
|
m = re.match(r"^(#{1,4})\s", stripped)
|
||||||
|
if m:
|
||||||
|
return len(m.group(1))
|
||||||
|
m = re.match(r"^<h([1-4])(?:\s|>)", stripped, re.IGNORECASE)
|
||||||
|
if m:
|
||||||
|
return int(m.group(1))
|
||||||
|
return 0
|
||||||
|
|
||||||
|
|
||||||
def _extract_section(
|
def _extract_section(
|
||||||
lines: list[str], match_idx: int, context_lines: int,
|
lines: list[str], match_idx: int, context_lines: int,
|
||||||
) -> str:
|
) -> str:
|
||||||
@@ -729,15 +756,23 @@ def _extract_section(
|
|||||||
|
|
||||||
When *match_idx* is itself a heading line, the section starts *at*
|
When *match_idx* is itself a heading line, the section starts *at*
|
||||||
that heading (no backward walk), avoiding pulling in content from
|
that heading (no backward walk), avoiding pulling in content from
|
||||||
earlier sibling sections.
|
earlier sibling sections. The forward walk only stops at a heading
|
||||||
|
of **equal or higher** level (e.g. a ``#`` match includes all its
|
||||||
|
``##`` children).
|
||||||
|
|
||||||
|
Always includes the YAML frontmatter if the original lines contain one,
|
||||||
|
because it carries critical metadata (``base_model``, ``tags``,
|
||||||
|
``instance_prompt``) that the LLM needs regardless of which section
|
||||||
|
matches.
|
||||||
"""
|
"""
|
||||||
n = len(lines)
|
n = len(lines)
|
||||||
|
|
||||||
# Determine start — if match is a heading, start right there
|
# Determine start — if match is a heading, start right there
|
||||||
if _is_heading(lines[match_idx]):
|
if _is_heading(lines[match_idx]):
|
||||||
start = match_idx
|
start = match_idx
|
||||||
|
match_level = _heading_level(lines[match_idx])
|
||||||
else:
|
else:
|
||||||
# Walk backward to find the nearest heading
|
match_level = 0
|
||||||
start = max(0, match_idx - context_lines)
|
start = max(0, match_idx - context_lines)
|
||||||
for i in range(match_idx - 1, max(-1, match_idx - context_lines * 3), -1):
|
for i in range(match_idx - 1, max(-1, match_idx - context_lines * 3), -1):
|
||||||
if i < 0:
|
if i < 0:
|
||||||
@@ -747,13 +782,25 @@ def _extract_section(
|
|||||||
start = i
|
start = i
|
||||||
break
|
break
|
||||||
|
|
||||||
# Walk forward to find the next heading at same or higher level
|
# Walk forward. Stop at a heading of EQUAL or HIGHER (fewer #) level,
|
||||||
end = min(n, match_idx + context_lines)
|
# so that a ``# Title`` match encompasses all its ``## Children``.
|
||||||
for i in range(match_idx + 1, min(n, match_idx + context_lines * 3)):
|
# Start from the full remaining lines so we don't truncate content
|
||||||
if _is_heading(lines[i]):
|
# when the YAML frontmatter pushes the matched heading far down.
|
||||||
|
end = n
|
||||||
|
walk_limit = min(n, match_idx + max(context_lines * 3, 120))
|
||||||
|
for i in range(match_idx + 1, walk_limit):
|
||||||
|
hl = _heading_level(lines[i])
|
||||||
|
if hl > 0 and (match_level == 0 or hl <= match_level):
|
||||||
end = i
|
end = i
|
||||||
break
|
break
|
||||||
|
|
||||||
|
# If YAML frontmatter exists before the matched section, prepend it.
|
||||||
|
if start > 0 and len(lines) > 1 and lines[0].strip() == "---":
|
||||||
|
for i in range(1, min(start, len(lines))):
|
||||||
|
if lines[i].strip() == "---":
|
||||||
|
yaml_section = "\n".join(lines[:i+1])
|
||||||
|
return yaml_section + "\n" + "\n".join(lines[start:end])
|
||||||
|
|
||||||
return "\n".join(lines[start:end])
|
return "\n".join(lines[start:end])
|
||||||
|
|
||||||
|
|
||||||
@@ -801,6 +848,26 @@ def _strip_gallery(text: str) -> str:
|
|||||||
return text
|
return text
|
||||||
|
|
||||||
|
|
||||||
|
def _strip_widget_section(text: str) -> str:
|
||||||
|
"""Strip the ``widget:`` YAML block from the README frontmatter.
|
||||||
|
|
||||||
|
The widget section contains verbose example prompts (``text: >-`` entries)
|
||||||
|
that are useful for post-processor gallery image extraction but carry
|
||||||
|
no signal for LLM metadata extraction. Stripping them dramatically
|
||||||
|
reduces prompt size (e.g. 2800+ chars → ~100 chars) and lets the LLM
|
||||||
|
focus on the actual YAML metadata fields (``base_model``, ``tags``,
|
||||||
|
``instance_prompt``, etc.).
|
||||||
|
"""
|
||||||
|
# Match widget: through the end of the frontmatter (the closing ---)
|
||||||
|
# or until the next YAML top-level key.
|
||||||
|
return re.sub(
|
||||||
|
r"\nwidget:.*?(?=\n\w+:|\n---)",
|
||||||
|
"",
|
||||||
|
text,
|
||||||
|
flags=re.DOTALL,
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
def _strip_badge_images(text: str) -> str:
|
def _strip_badge_images(text: str) -> str:
|
||||||
badge_keywords = (
|
badge_keywords = (
|
||||||
"badge", "shield", "logo", "icon", "download", "license",
|
"badge", "shield", "logo", "icon", "download", "license",
|
||||||
@@ -364,6 +364,9 @@ class LLMService:
|
|||||||
"think": False,
|
"think": False,
|
||||||
"options": {
|
"options": {
|
||||||
"temperature": temperature,
|
"temperature": temperature,
|
||||||
|
# Allow up to 32K context so the model has room to think
|
||||||
|
# AND produce output without hitting the 4K default limit.
|
||||||
|
"num_ctx": 32768,
|
||||||
},
|
},
|
||||||
}
|
}
|
||||||
if response_format is not None:
|
if response_format is not None:
|
||||||
@@ -381,6 +384,16 @@ class LLMService:
|
|||||||
if max_tokens is not None:
|
if max_tokens is not None:
|
||||||
payload["max_tokens"] = max_tokens
|
payload["max_tokens"] = max_tokens
|
||||||
|
|
||||||
|
if is_ollama:
|
||||||
|
logger.info(
|
||||||
|
"Ollama request: model=%s num_ctx=%s num_predict=%s format=%s think=%s",
|
||||||
|
payload.get("model"),
|
||||||
|
payload.get("options", {}).get("num_ctx"),
|
||||||
|
payload.get("options", {}).get("num_predict"),
|
||||||
|
payload.get("format", "none"),
|
||||||
|
payload.get("think"),
|
||||||
|
)
|
||||||
|
|
||||||
headers = self._build_headers(cfg["api_key"])
|
headers = self._build_headers(cfg["api_key"])
|
||||||
|
|
||||||
attempt = 0
|
attempt = 0
|
||||||
@@ -507,8 +520,23 @@ class LLMService:
|
|||||||
)
|
)
|
||||||
|
|
||||||
try:
|
try:
|
||||||
return json.loads(result["content"])
|
parsed = json.loads(result["content"])
|
||||||
|
logger.info(
|
||||||
|
"LLM response base_model=%s tags=%s confidence=%s",
|
||||||
|
parsed.get("base_model", "?")[:50],
|
||||||
|
parsed.get("tags", []),
|
||||||
|
parsed.get("confidence", "?"),
|
||||||
|
)
|
||||||
|
logger.info(
|
||||||
|
"LLM raw content: %s",
|
||||||
|
(result.get("content") or "")[:1200],
|
||||||
|
)
|
||||||
|
return parsed
|
||||||
except (json.JSONDecodeError, TypeError) as exc:
|
except (json.JSONDecodeError, TypeError) as exc:
|
||||||
|
logger.info(
|
||||||
|
"LLM raw response (first 800 chars): %s",
|
||||||
|
(result.get("content") or "")[:800],
|
||||||
|
)
|
||||||
logger.warning(
|
logger.warning(
|
||||||
"LLM JSON parse failed on first attempt: %s. Retrying.", exc
|
"LLM JSON parse failed on first attempt: %s. Retrying.", exc
|
||||||
)
|
)
|
||||||
|
|||||||
@@ -50,78 +50,74 @@ class MockScanner:
|
|||||||
|
|
||||||
class TestListBaseModels:
|
class TestListBaseModels:
|
||||||
|
|
||||||
@pytest.mark.asyncio
|
_MOCK_MODELS = ["SDXL 1.0", "Flux.1 D", "SD 1.5"]
|
||||||
async def test_empty_cache(self):
|
|
||||||
scanner = MockScanner([])
|
|
||||||
with mock.patch(
|
|
||||||
"py.services.service_registry.ServiceRegistry",
|
|
||||||
get_lora_scanner=mock.AsyncMock(return_value=scanner),
|
|
||||||
get_checkpoint_scanner=mock.AsyncMock(return_value=None),
|
|
||||||
get_embedding_scanner=mock.AsyncMock(return_value=None),
|
|
||||||
):
|
|
||||||
result = await list_base_models()
|
|
||||||
assert result == []
|
|
||||||
|
|
||||||
@pytest.mark.asyncio
|
@pytest.mark.asyncio
|
||||||
async def test_merges_all_scanners(self):
|
async def test_returns_all_models(self):
|
||||||
lora_scanner = MockScanner([
|
"""Verifies the function delegates to CivitaiBaseModelService.
|
||||||
{"base_model": "SDXL 1.0"},
|
|
||||||
{"base_model": "Flux.1 D"},
|
Uses a monkey-patch on ``get_instance`` to return a controlled mock
|
||||||
{"base_model": "SDXL 1.0"},
|
so we don't need to work around ``mock.patch``'s dotted-path
|
||||||
])
|
limitations with lazy imports inside function bodies."""
|
||||||
ckpt_scanner = MockScanner([
|
import py.services.civitai_base_model_service as _svc
|
||||||
{"base_model": "SDXL 1.0"},
|
orig = _svc.CivitaiBaseModelService.get_instance
|
||||||
{"base_model": "SD 1.5"},
|
mock_svc = mock.AsyncMock()
|
||||||
])
|
mock_svc.get_base_models.return_value = {
|
||||||
with mock.patch(
|
"models": self._MOCK_MODELS,
|
||||||
"py.services.service_registry.ServiceRegistry",
|
}
|
||||||
get_lora_scanner=mock.AsyncMock(return_value=lora_scanner),
|
_svc.CivitaiBaseModelService.get_instance = mock.AsyncMock(
|
||||||
get_checkpoint_scanner=mock.AsyncMock(return_value=ckpt_scanner),
|
return_value=mock_svc,
|
||||||
get_embedding_scanner=mock.AsyncMock(return_value=None),
|
)
|
||||||
):
|
try:
|
||||||
result = await list_base_models()
|
result = await list_base_models()
|
||||||
assert result == ["SDXL 1.0", "Flux.1 D", "SD 1.5"]
|
assert result == self._MOCK_MODELS
|
||||||
|
finally:
|
||||||
|
_svc.CivitaiBaseModelService.get_instance = orig
|
||||||
|
|
||||||
@pytest.mark.asyncio
|
@pytest.mark.asyncio
|
||||||
async def test_limit(self):
|
async def test_limit(self):
|
||||||
scanner = MockScanner([
|
import py.services.civitai_base_model_service as _svc
|
||||||
{"base_model": "A"}, {"base_model": "B"}, {"base_model": "C"},
|
orig = _svc.CivitaiBaseModelService.get_instance
|
||||||
])
|
mock_svc = mock.AsyncMock()
|
||||||
with mock.patch(
|
mock_svc.get_base_models.return_value = {"models": ["A", "B", "C"]}
|
||||||
"py.services.service_registry.ServiceRegistry",
|
_svc.CivitaiBaseModelService.get_instance = mock.AsyncMock(
|
||||||
get_lora_scanner=mock.AsyncMock(return_value=scanner),
|
return_value=mock_svc,
|
||||||
get_checkpoint_scanner=mock.AsyncMock(return_value=None),
|
)
|
||||||
get_embedding_scanner=mock.AsyncMock(return_value=None),
|
try:
|
||||||
):
|
|
||||||
result = await list_base_models(limit=2)
|
result = await list_base_models(limit=2)
|
||||||
assert result == ["A", "B"]
|
assert result == ["A", "B"]
|
||||||
|
finally:
|
||||||
|
_svc.CivitaiBaseModelService.get_instance = orig
|
||||||
|
|
||||||
@pytest.mark.asyncio
|
@pytest.mark.asyncio
|
||||||
async def test_all_scanners_return_none(self):
|
async def test_empty_list_when_service_returns_empty(self):
|
||||||
with mock.patch(
|
import py.services.civitai_base_model_service as _svc
|
||||||
"py.services.service_registry.ServiceRegistry",
|
orig = _svc.CivitaiBaseModelService.get_instance
|
||||||
get_lora_scanner=mock.AsyncMock(return_value=None),
|
mock_svc = mock.AsyncMock()
|
||||||
get_checkpoint_scanner=mock.AsyncMock(return_value=None),
|
mock_svc.get_base_models.return_value = {"models": []}
|
||||||
get_embedding_scanner=mock.AsyncMock(return_value=None),
|
_svc.CivitaiBaseModelService.get_instance = mock.AsyncMock(
|
||||||
):
|
return_value=mock_svc,
|
||||||
|
)
|
||||||
|
try:
|
||||||
result = await list_base_models()
|
result = await list_base_models()
|
||||||
assert result == []
|
assert result == []
|
||||||
|
finally:
|
||||||
|
_svc.CivitaiBaseModelService.get_instance = orig
|
||||||
|
|
||||||
@pytest.mark.asyncio
|
@pytest.mark.asyncio
|
||||||
async def test_skips_empty_or_missing_base_model(self):
|
async def test_handles_exception(self):
|
||||||
scanner = MockScanner([
|
import py.services.civitai_base_model_service as _svc
|
||||||
{"base_model": "SDXL 1.0"},
|
orig = _svc.CivitaiBaseModelService.get_instance
|
||||||
{"file_name": "foo.safetensors"}, # no base_model key
|
mock_svc = mock.AsyncMock()
|
||||||
{"base_model": ""}, # empty
|
mock_svc.get_base_models.side_effect = RuntimeError("API error")
|
||||||
])
|
_svc.CivitaiBaseModelService.get_instance = mock.AsyncMock(
|
||||||
with mock.patch(
|
return_value=mock_svc,
|
||||||
"py.services.service_registry.ServiceRegistry",
|
)
|
||||||
get_lora_scanner=mock.AsyncMock(return_value=scanner),
|
try:
|
||||||
get_checkpoint_scanner=mock.AsyncMock(return_value=None),
|
|
||||||
get_embedding_scanner=mock.AsyncMock(return_value=None),
|
|
||||||
):
|
|
||||||
result = await list_base_models()
|
result = await list_base_models()
|
||||||
assert result == ["SDXL 1.0"]
|
assert result == []
|
||||||
|
finally:
|
||||||
|
_svc.CivitaiBaseModelService.get_instance = orig
|
||||||
|
|
||||||
|
|
||||||
# ======================================================================
|
# ======================================================================
|
||||||
@@ -326,21 +322,21 @@ class TestConvertReadmeToHtml:
|
|||||||
"""Tests for the inline markdown→HTML converter."""
|
"""Tests for the inline markdown→HTML converter."""
|
||||||
|
|
||||||
def test_empty_input(self):
|
def test_empty_input(self):
|
||||||
from py.services.agent.skills.enrich_hf_metadata.md_to_html import \
|
from py.services.agent.skills.enrich_hf_metadata.readme_processor import \
|
||||||
convert_readme_to_html
|
convert_readme_to_html
|
||||||
|
|
||||||
assert convert_readme_to_html("") == ""
|
assert convert_readme_to_html("") == ""
|
||||||
assert convert_readme_to_html(None) == "" # type: ignore[arg-type]
|
assert convert_readme_to_html(None) == "" # type: ignore[arg-type]
|
||||||
|
|
||||||
def test_heading(self):
|
def test_heading(self):
|
||||||
from py.services.agent.skills.enrich_hf_metadata.md_to_html import \
|
from py.services.agent.skills.enrich_hf_metadata.readme_processor import \
|
||||||
convert_readme_to_html
|
convert_readme_to_html
|
||||||
|
|
||||||
result = convert_readme_to_html("# Title")
|
result = convert_readme_to_html("# Title")
|
||||||
assert "<h1>" in result and "Title" in result
|
assert "<h1>" in result and "Title" in result
|
||||||
|
|
||||||
def test_subheadings(self):
|
def test_subheadings(self):
|
||||||
from py.services.agent.skills.enrich_hf_metadata.md_to_html import \
|
from py.services.agent.skills.enrich_hf_metadata.readme_processor import \
|
||||||
convert_readme_to_html
|
convert_readme_to_html
|
||||||
|
|
||||||
md = "## Overview\n\n### Details"
|
md = "## Overview\n\n### Details"
|
||||||
@@ -349,7 +345,7 @@ class TestConvertReadmeToHtml:
|
|||||||
assert "<h3>Details</h3>" in result
|
assert "<h3>Details</h3>" in result
|
||||||
|
|
||||||
def test_bold_and_italic(self):
|
def test_bold_and_italic(self):
|
||||||
from py.services.agent.skills.enrich_hf_metadata.md_to_html import \
|
from py.services.agent.skills.enrich_hf_metadata.readme_processor import \
|
||||||
convert_readme_to_html
|
convert_readme_to_html
|
||||||
|
|
||||||
md = "**bold** and *italic*"
|
md = "**bold** and *italic*"
|
||||||
@@ -358,7 +354,7 @@ class TestConvertReadmeToHtml:
|
|||||||
assert "<em>italic</em>" in result
|
assert "<em>italic</em>" in result
|
||||||
|
|
||||||
def test_inline_code(self):
|
def test_inline_code(self):
|
||||||
from py.services.agent.skills.enrich_hf_metadata.md_to_html import \
|
from py.services.agent.skills.enrich_hf_metadata.readme_processor import \
|
||||||
convert_readme_to_html
|
convert_readme_to_html
|
||||||
|
|
||||||
md = "Use `model.train()`"
|
md = "Use `model.train()`"
|
||||||
@@ -366,7 +362,7 @@ class TestConvertReadmeToHtml:
|
|||||||
assert "<code>" in result and "model.train()" in result
|
assert "<code>" in result and "model.train()" in result
|
||||||
|
|
||||||
def test_fenced_code_block(self):
|
def test_fenced_code_block(self):
|
||||||
from py.services.agent.skills.enrich_hf_metadata.md_to_html import \
|
from py.services.agent.skills.enrich_hf_metadata.readme_processor import \
|
||||||
convert_readme_to_html
|
convert_readme_to_html
|
||||||
|
|
||||||
md = "```python\nprint('hello')\n```"
|
md = "```python\nprint('hello')\n```"
|
||||||
@@ -375,7 +371,7 @@ class TestConvertReadmeToHtml:
|
|||||||
assert "print" in result and "hello" in result
|
assert "print" in result and "hello" in result
|
||||||
|
|
||||||
def test_unordered_list(self):
|
def test_unordered_list(self):
|
||||||
from py.services.agent.skills.enrich_hf_metadata.md_to_html import \
|
from py.services.agent.skills.enrich_hf_metadata.readme_processor import \
|
||||||
convert_readme_to_html
|
convert_readme_to_html
|
||||||
|
|
||||||
md = "- item one\n- item two"
|
md = "- item one\n- item two"
|
||||||
@@ -385,7 +381,7 @@ class TestConvertReadmeToHtml:
|
|||||||
assert "<li>item two</li>" in result
|
assert "<li>item two</li>" in result
|
||||||
|
|
||||||
def test_ordered_list(self):
|
def test_ordered_list(self):
|
||||||
from py.services.agent.skills.enrich_hf_metadata.md_to_html import \
|
from py.services.agent.skills.enrich_hf_metadata.readme_processor import \
|
||||||
convert_readme_to_html
|
convert_readme_to_html
|
||||||
|
|
||||||
md = "1. first\n2. second"
|
md = "1. first\n2. second"
|
||||||
@@ -395,7 +391,7 @@ class TestConvertReadmeToHtml:
|
|||||||
assert "<li>second</li>" in result
|
assert "<li>second</li>" in result
|
||||||
|
|
||||||
def test_link(self):
|
def test_link(self):
|
||||||
from py.services.agent.skills.enrich_hf_metadata.md_to_html import \
|
from py.services.agent.skills.enrich_hf_metadata.readme_processor import \
|
||||||
convert_readme_to_html
|
convert_readme_to_html
|
||||||
|
|
||||||
md = "[click here](https://example.com)"
|
md = "[click here](https://example.com)"
|
||||||
@@ -403,7 +399,7 @@ class TestConvertReadmeToHtml:
|
|||||||
assert '<a href="https://example.com">click here</a>' in result
|
assert '<a href="https://example.com">click here</a>' in result
|
||||||
|
|
||||||
def test_badge_image_stripped(self):
|
def test_badge_image_stripped(self):
|
||||||
from py.services.agent.skills.enrich_hf_metadata.md_to_html import \
|
from py.services.agent.skills.enrich_hf_metadata.readme_processor import \
|
||||||
convert_readme_to_html
|
convert_readme_to_html
|
||||||
|
|
||||||
md = ""
|
md = ""
|
||||||
@@ -411,7 +407,7 @@ class TestConvertReadmeToHtml:
|
|||||||
assert "img.shields.io" not in result
|
assert "img.shields.io" not in result
|
||||||
|
|
||||||
def test_gallery_stripped(self):
|
def test_gallery_stripped(self):
|
||||||
from py.services.agent.skills.enrich_hf_metadata.md_to_html import \
|
from py.services.agent.skills.enrich_hf_metadata.readme_processor import \
|
||||||
convert_readme_to_html
|
convert_readme_to_html
|
||||||
|
|
||||||
md = "Some text\n<Gallery />\nmore text"
|
md = "Some text\n<Gallery />\nmore text"
|
||||||
@@ -419,7 +415,7 @@ class TestConvertReadmeToHtml:
|
|||||||
assert "<Gallery" not in result
|
assert "<Gallery" not in result
|
||||||
|
|
||||||
def test_yaml_frontmatter_stripped(self):
|
def test_yaml_frontmatter_stripped(self):
|
||||||
from py.services.agent.skills.enrich_hf_metadata.md_to_html import \
|
from py.services.agent.skills.enrich_hf_metadata.readme_processor import \
|
||||||
convert_readme_to_html
|
convert_readme_to_html
|
||||||
|
|
||||||
md = "---\ntags:\n - lora\nbase_model: flux\n---\n\n# Real content"
|
md = "---\ntags:\n - lora\nbase_model: flux\n---\n\n# Real content"
|
||||||
@@ -428,7 +424,7 @@ class TestConvertReadmeToHtml:
|
|||||||
assert "<h1>Real content</h1>" in result
|
assert "<h1>Real content</h1>" in result
|
||||||
|
|
||||||
def test_table(self):
|
def test_table(self):
|
||||||
from py.services.agent.skills.enrich_hf_metadata.md_to_html import \
|
from py.services.agent.skills.enrich_hf_metadata.readme_processor import \
|
||||||
convert_readme_to_html
|
convert_readme_to_html
|
||||||
|
|
||||||
md = "| A | B |\n|---|---|\n| 1 | 2 |"
|
md = "| A | B |\n|---|---|\n| 1 | 2 |"
|
||||||
@@ -438,7 +434,7 @@ class TestConvertReadmeToHtml:
|
|||||||
assert "<td>1</td>" in result
|
assert "<td>1</td>" in result
|
||||||
|
|
||||||
def test_horizontal_rule(self):
|
def test_horizontal_rule(self):
|
||||||
from py.services.agent.skills.enrich_hf_metadata.md_to_html import \
|
from py.services.agent.skills.enrich_hf_metadata.readme_processor import \
|
||||||
convert_readme_to_html
|
convert_readme_to_html
|
||||||
|
|
||||||
md = "before\n\n---\n\nafter"
|
md = "before\n\n---\n\nafter"
|
||||||
@@ -446,14 +442,14 @@ class TestConvertReadmeToHtml:
|
|||||||
assert "<hr>" in result
|
assert "<hr>" in result
|
||||||
|
|
||||||
def test_inline_code_preserves_angle_bracket(self):
|
def test_inline_code_preserves_angle_bracket(self):
|
||||||
from py.services.agent.skills.enrich_hf_metadata.md_to_html import \
|
from py.services.agent.skills.enrich_hf_metadata.readme_processor import \
|
||||||
convert_readme_to_html
|
convert_readme_to_html
|
||||||
|
|
||||||
result = convert_readme_to_html("Use `a < b` in code")
|
result = convert_readme_to_html("Use `a < b` in code")
|
||||||
assert "<code>a < b</code>" in result
|
assert "<code>a < b</code>" in result
|
||||||
|
|
||||||
def test_blockquote(self):
|
def test_blockquote(self):
|
||||||
from py.services.agent.skills.enrich_hf_metadata.md_to_html import \
|
from py.services.agent.skills.enrich_hf_metadata.readme_processor import \
|
||||||
convert_readme_to_html
|
convert_readme_to_html
|
||||||
|
|
||||||
md = "> quoted text"
|
md = "> quoted text"
|
||||||
@@ -462,7 +458,7 @@ class TestConvertReadmeToHtml:
|
|||||||
assert "quoted text" in result
|
assert "quoted text" in result
|
||||||
|
|
||||||
def test_indented_whitespace_not_treated_as_code(self):
|
def test_indented_whitespace_not_treated_as_code(self):
|
||||||
from py.services.agent.skills.enrich_hf_metadata.md_to_html import \
|
from py.services.agent.skills.enrich_hf_metadata.readme_processor import \
|
||||||
convert_readme_to_html
|
convert_readme_to_html
|
||||||
|
|
||||||
md = "- item\n \n## heading after spacing"
|
md = "- item\n \n## heading after spacing"
|
||||||
@@ -497,7 +493,7 @@ base_model: flux
|
|||||||
"""
|
"""
|
||||||
|
|
||||||
def test_extracts_widget_images(self):
|
def test_extracts_widget_images(self):
|
||||||
from py.services.agent.skills.enrich_hf_metadata.md_to_html import \
|
from py.services.agent.skills.enrich_hf_metadata.readme_processor import \
|
||||||
extract_gallery_images
|
extract_gallery_images
|
||||||
|
|
||||||
images = extract_gallery_images(self._README, self._REPO)
|
images = extract_gallery_images(self._README, self._REPO)
|
||||||
@@ -519,7 +515,7 @@ base_model: flux
|
|||||||
assert images[1]["meta"]["prompt"] == "multi line prompt here"
|
assert images[1]["meta"]["prompt"] == "multi line prompt here"
|
||||||
|
|
||||||
def test_default_dimensions_used(self):
|
def test_default_dimensions_used(self):
|
||||||
from py.services.agent.skills.enrich_hf_metadata.md_to_html import \
|
from py.services.agent.skills.enrich_hf_metadata.readme_processor import \
|
||||||
extract_gallery_images
|
extract_gallery_images
|
||||||
|
|
||||||
images = extract_gallery_images(self._README, self._REPO)
|
images = extract_gallery_images(self._README, self._REPO)
|
||||||
@@ -527,7 +523,7 @@ base_model: flux
|
|||||||
assert images[0]["height"] == 512
|
assert images[0]["height"] == 512
|
||||||
|
|
||||||
def test_custom_dimensions_applied(self):
|
def test_custom_dimensions_applied(self):
|
||||||
from py.services.agent.skills.enrich_hf_metadata.md_to_html import \
|
from py.services.agent.skills.enrich_hf_metadata.readme_processor import \
|
||||||
extract_gallery_images
|
extract_gallery_images
|
||||||
|
|
||||||
images = extract_gallery_images(
|
images = extract_gallery_images(
|
||||||
@@ -538,27 +534,27 @@ base_model: flux
|
|||||||
assert images[0]["height"] == 1024
|
assert images[0]["height"] == 1024
|
||||||
|
|
||||||
def test_empty_readme_returns_empty(self):
|
def test_empty_readme_returns_empty(self):
|
||||||
from py.services.agent.skills.enrich_hf_metadata.md_to_html import \
|
from py.services.agent.skills.enrich_hf_metadata.readme_processor import \
|
||||||
extract_gallery_images
|
extract_gallery_images
|
||||||
|
|
||||||
assert extract_gallery_images("", self._REPO) == []
|
assert extract_gallery_images("", self._REPO) == []
|
||||||
assert extract_gallery_images("no frontmatter here", self._REPO) == []
|
assert extract_gallery_images("no frontmatter here", self._REPO) == []
|
||||||
|
|
||||||
def test_empty_repo_returns_empty(self):
|
def test_empty_repo_returns_empty(self):
|
||||||
from py.services.agent.skills.enrich_hf_metadata.md_to_html import \
|
from py.services.agent.skills.enrich_hf_metadata.readme_processor import \
|
||||||
extract_gallery_images
|
extract_gallery_images
|
||||||
|
|
||||||
assert extract_gallery_images(self._README, "") == []
|
assert extract_gallery_images(self._README, "") == []
|
||||||
|
|
||||||
def test_no_widget_returns_empty(self):
|
def test_no_widget_returns_empty(self):
|
||||||
from py.services.agent.skills.enrich_hf_metadata.md_to_html import \
|
from py.services.agent.skills.enrich_hf_metadata.readme_processor import \
|
||||||
extract_gallery_images
|
extract_gallery_images
|
||||||
|
|
||||||
md = "---\ntags:\n - lora\n---\n\nContent"
|
md = "---\ntags:\n - lora\n---\n\nContent"
|
||||||
assert extract_gallery_images(md, self._REPO) == []
|
assert extract_gallery_images(md, self._REPO) == []
|
||||||
|
|
||||||
def test_extract_repo_from_hf_url(self):
|
def test_extract_repo_from_hf_url(self):
|
||||||
from py.services.agent.skills.enrich_hf_metadata.md_to_html import \
|
from py.services.agent.skills.enrich_hf_metadata.readme_processor import \
|
||||||
extract_repo_from_hf_url
|
extract_repo_from_hf_url
|
||||||
|
|
||||||
assert extract_repo_from_hf_url(
|
assert extract_repo_from_hf_url(
|
||||||
@@ -568,8 +564,10 @@ base_model: flux
|
|||||||
assert extract_repo_from_hf_url("not a url") == ""
|
assert extract_repo_from_hf_url("not a url") == ""
|
||||||
|
|
||||||
def test_plain_yaml_scalar_text(self):
|
def test_plain_yaml_scalar_text(self):
|
||||||
"""Unquoted multi-line YAML scalar (plain format) should extract prompt."""
|
"""Unquoted multi-line YAML scalar (plain format) extracts first line only.
|
||||||
from py.services.agent.skills.enrich_hf_metadata.md_to_html import \
|
The YAML parser only reports the value on the ``text:`` line; continuation
|
||||||
|
lines are handled by the post-processor from the raw README."""
|
||||||
|
from py.services.agent.skills.enrich_hf_metadata.readme_processor import \
|
||||||
extract_gallery_images
|
extract_gallery_images
|
||||||
|
|
||||||
md = """---
|
md = """---
|
||||||
@@ -581,8 +579,7 @@ widget:
|
|||||||
---"""
|
---"""
|
||||||
images = extract_gallery_images(md, "user/repo")
|
images = extract_gallery_images(md, "user/repo")
|
||||||
assert len(images) == 1
|
assert len(images) == 1
|
||||||
assert "two samurais doing a muay thai fight" in images[0]["meta"]["prompt"]
|
assert images[0]["meta"]["prompt"] == "two samurais doing a muay thai fight"
|
||||||
assert "Textured abstract style" in images[0]["meta"]["prompt"]
|
|
||||||
|
|
||||||
|
|
||||||
# ======================================================================
|
# ======================================================================
|
||||||
@@ -603,7 +600,7 @@ class TestExtractGalleryTableImages:
|
|||||||
|
|
||||||
@staticmethod
|
@staticmethod
|
||||||
def _extract(md: str, repo: str = _REPO, existing: set | None = None):
|
def _extract(md: str, repo: str = _REPO, existing: set | None = None):
|
||||||
from py.services.agent.skills.enrich_hf_metadata.md_to_html import \
|
from py.services.agent.skills.enrich_hf_metadata.readme_processor import \
|
||||||
extract_gallery_table_images
|
extract_gallery_table_images
|
||||||
return extract_gallery_table_images(md, repo, existing_urls=existing)
|
return extract_gallery_table_images(md, repo, existing_urls=existing)
|
||||||
|
|
||||||
@@ -647,7 +644,7 @@ class TestCleanReadmeForLlm:
|
|||||||
|
|
||||||
@staticmethod
|
@staticmethod
|
||||||
def _clean(md: str, max_length: int = 6000) -> str:
|
def _clean(md: str, max_length: int = 6000) -> str:
|
||||||
from py.services.agent.skills.enrich_hf_metadata.md_to_html import \
|
from py.services.agent.skills.enrich_hf_metadata.readme_processor import \
|
||||||
clean_readme_for_llm
|
clean_readme_for_llm
|
||||||
return clean_readme_for_llm(md, max_length=max_length)
|
return clean_readme_for_llm(md, max_length=max_length)
|
||||||
|
|
||||||
@@ -665,10 +662,9 @@ class TestCleanReadmeForLlm:
|
|||||||
|
|
||||||
# -- widget section stripping -------------------------------------------
|
# -- widget section stripping -------------------------------------------
|
||||||
|
|
||||||
def test_widget_text_preserved_in_cleaned_output(self):
|
def test_widget_stripped_frontmatter_metadata_preserved(self):
|
||||||
"""Widget section text is preserved — it provides useful signal
|
"""Widget section is stripped, but ``base_model``, ``tags``,
|
||||||
for tag and description extraction (example prompts describe what
|
``instance_prompt`` survive."""
|
||||||
the model generates)."""
|
|
||||||
md = """---
|
md = """---
|
||||||
tags:
|
tags:
|
||||||
- lora
|
- lora
|
||||||
@@ -689,11 +685,10 @@ instance_prompt: trigger word
|
|||||||
This is the actual content.
|
This is the actual content.
|
||||||
"""
|
"""
|
||||||
result = self._clean(md)
|
result = self._clean(md)
|
||||||
# Widget text content preserved (valuable signal for tags)
|
# Widget text stripped (it's handled by the post-processor gallery
|
||||||
# YAML folded scalars (``>-``) may split text across lines
|
# extraction instead)
|
||||||
assert "a test prompt" in result
|
assert "a test prompt" not in result
|
||||||
assert "another long" in result
|
assert "another long" not in result
|
||||||
assert "prompt here" in result
|
|
||||||
# Non-widget frontmatter preserved
|
# Non-widget frontmatter preserved
|
||||||
assert "base_model: black-forest-labs/FLUX.1-dev" in result
|
assert "base_model: black-forest-labs/FLUX.1-dev" in result
|
||||||
assert "instance_prompt: trigger word" in result
|
assert "instance_prompt: trigger word" in result
|
||||||
@@ -703,7 +698,7 @@ This is the actual content.
|
|||||||
assert "Model Description" in result
|
assert "Model Description" in result
|
||||||
|
|
||||||
def test_widget_last_key_in_frontmatter(self):
|
def test_widget_last_key_in_frontmatter(self):
|
||||||
"""Widget text at end of frontmatter is preserved."""
|
"""Widget stripped, non-widget keys preserved."""
|
||||||
md = """---
|
md = """---
|
||||||
tags:
|
tags:
|
||||||
- lora
|
- lora
|
||||||
@@ -715,7 +710,7 @@ widget:
|
|||||||
# Content
|
# Content
|
||||||
"""
|
"""
|
||||||
result = self._clean(md)
|
result = self._clean(md)
|
||||||
assert "prompt" in result
|
assert "prompt" not in result
|
||||||
assert "tags:" in result
|
assert "tags:" in result
|
||||||
|
|
||||||
def test_no_widget_untouched(self):
|
def test_no_widget_untouched(self):
|
||||||
@@ -798,12 +793,13 @@ pixel art sprite, game asset, transparent background
|
|||||||
|
|
||||||
# -- standalone image stripping ------------------------------------------
|
# -- standalone image stripping ------------------------------------------
|
||||||
|
|
||||||
def test_standalone_image_stripped(self):
|
def test_standalone_image_urls_preserved_for_llm(self):
|
||||||
|
"""Markdown image URLs are kept so the LLM can extract a ``preview_url``."""
|
||||||
md = "## Gallery\n\n\n\nSome text."
|
md = "## Gallery\n\n\n\nSome text."
|
||||||
result = self._clean(md)
|
result = self._clean(md)
|
||||||
assert "cdn.hf.co" not in result
|
# URLs preserved for LLM preview extraction
|
||||||
assert "sample" in result # alt text preserved
|
assert "cdn.hf.co/img.png" in result
|
||||||
assert "another" in result # alt text preserved
|
assert "cdn.hf.co/img2.png" in result
|
||||||
assert "## Gallery" in result
|
assert "## Gallery" in result
|
||||||
assert "Some text." in result
|
assert "Some text." in result
|
||||||
|
|
||||||
@@ -1001,10 +997,10 @@ Weights for this model are available in Safetensors format.
|
|||||||
original_len = len(md)
|
original_len = len(md)
|
||||||
result = self._clean(md)
|
result = self._clean(md)
|
||||||
|
|
||||||
# Still significantly smaller (widget text is kept but training
|
# Significantly smaller: widget + training tables + code blocks
|
||||||
# tables, code blocks, boilerplate are stripped)
|
# + boilerplate all stripped
|
||||||
assert len(result) < original_len * 0.7, (
|
assert len(result) < original_len * 0.35, (
|
||||||
f"Expected <70% of original, got {len(result)}/{original_len}"
|
f"Expected <35% of original, got {len(result)}/{original_len}"
|
||||||
)
|
)
|
||||||
|
|
||||||
# Signal preserved
|
# Signal preserved
|
||||||
@@ -1013,9 +1009,8 @@ Weights for this model are available in Safetensors format.
|
|||||||
assert "3D" in result
|
assert "3D" in result
|
||||||
assert "Toon" in result
|
assert "Toon" in result
|
||||||
|
|
||||||
# Widget content preserved (text is valuable signal for tags/desc)
|
# Widget content stripped (post-processor handles image extraction)
|
||||||
assert "close-up of a cartoon character face" in result
|
assert "close-up of a cartoon character face" not in result
|
||||||
assert "Super Detail" in result
|
|
||||||
|
|
||||||
# Noise stripped
|
# Noise stripped
|
||||||
assert "import torch" not in result
|
assert "import torch" not in result
|
||||||
|
|||||||
489
tests/agent_cli/test_readme_processor.py
Normal file
489
tests/agent_cli/test_readme_processor.py
Normal file
@@ -0,0 +1,489 @@
|
|||||||
|
"""Tests for ``readme_processor.py`` — HF README processing for enrich_hf_metadata.
|
||||||
|
|
||||||
|
Import via ``importlib`` to avoid the ``folder_paths`` dependency in
|
||||||
|
``py.services.agent.__init__``.
|
||||||
|
"""
|
||||||
|
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
import importlib.util
|
||||||
|
import re
|
||||||
|
from pathlib import Path
|
||||||
|
|
||||||
|
import pytest
|
||||||
|
|
||||||
|
_MODULE_PATH = Path(__file__).parents[2] / "py" / "services" / "agent" / "skills" / "enrich_hf_metadata" / "readme_processor.py"
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.fixture(scope="session")
|
||||||
|
def R():
|
||||||
|
"""Load the ``readme_processor`` module once per session."""
|
||||||
|
spec = importlib.util.spec_from_file_location("readme_processor", str(_MODULE_PATH))
|
||||||
|
mod = importlib.util.module_from_spec(spec)
|
||||||
|
spec.loader.exec_module(mod)
|
||||||
|
return mod
|
||||||
|
|
||||||
|
|
||||||
|
# ======================================================================
|
||||||
|
# extract_gallery_images
|
||||||
|
# ======================================================================
|
||||||
|
|
||||||
|
|
||||||
|
class TestExtractGalleryImages:
|
||||||
|
def test_empty(self, R):
|
||||||
|
assert R.extract_gallery_images("", "repo") == []
|
||||||
|
assert R.extract_gallery_images("no frontmatter", "repo") == []
|
||||||
|
|
||||||
|
def test_no_widget(self, R):
|
||||||
|
readme = "---\ntags: [test]\n---\nbody"
|
||||||
|
assert R.extract_gallery_images(readme, "repo") == []
|
||||||
|
|
||||||
|
def test_widget_simple_text(self, R):
|
||||||
|
"""YAML ``text: 'plain'`` → extracted as-is."""
|
||||||
|
readme = """---
|
||||||
|
widget:
|
||||||
|
- text: 'a cute cat'
|
||||||
|
output:
|
||||||
|
url: images/cat.png
|
||||||
|
---"""
|
||||||
|
imgs = R.extract_gallery_images(readme, "user/repo")
|
||||||
|
assert len(imgs) == 1
|
||||||
|
assert imgs[0]["meta"]["prompt"] == "a cute cat"
|
||||||
|
assert "images/cat.png" in imgs[0]["url"]
|
||||||
|
|
||||||
|
def test_widget_unquoted_text(self, R):
|
||||||
|
"""YAML ``text: plain value`` without quotes."""
|
||||||
|
readme = """---
|
||||||
|
widget:
|
||||||
|
- text: simple text
|
||||||
|
output:
|
||||||
|
url: img.png
|
||||||
|
---"""
|
||||||
|
imgs = R.extract_gallery_images(readme, "user/repo")
|
||||||
|
assert len(imgs) == 1
|
||||||
|
assert imgs[0]["meta"]["prompt"] == "simple text"
|
||||||
|
|
||||||
|
def test_widget_block_scalar(self, R):
|
||||||
|
"""YAML ``text: >-`` folded block scalar — extract actual content."""
|
||||||
|
readme = """---
|
||||||
|
widget:
|
||||||
|
- text: >-
|
||||||
|
Long toons, a close-up of a cartoon characters face is featured in a
|
||||||
|
vibrant red backdrop.
|
||||||
|
output:
|
||||||
|
url: images/LT4.png
|
||||||
|
---"""
|
||||||
|
imgs = R.extract_gallery_images(readme, "user/repo")
|
||||||
|
assert len(imgs) == 1
|
||||||
|
prompt = imgs[0]["meta"]["prompt"]
|
||||||
|
assert "Long toons" in prompt
|
||||||
|
assert "vibrant red backdrop" in prompt
|
||||||
|
assert prompt != ">-"
|
||||||
|
|
||||||
|
def test_widget_dash_prefix_output(self, R):
|
||||||
|
"""YAML ``- output:`` (dash prefix) — regression for widget parsing."""
|
||||||
|
readme = """---
|
||||||
|
widget:
|
||||||
|
- output:
|
||||||
|
url: images/test.png
|
||||||
|
text: dash test
|
||||||
|
---"""
|
||||||
|
imgs = R.extract_gallery_images(readme, "user/repo")
|
||||||
|
assert len(imgs) == 1
|
||||||
|
assert imgs[0]["meta"]["prompt"] == "dash test"
|
||||||
|
assert "images/test.png" in imgs[0]["url"]
|
||||||
|
|
||||||
|
def test_widget_mixed_entries(self, R):
|
||||||
|
"""Multiple widget entries with different text styles."""
|
||||||
|
readme = """---
|
||||||
|
widget:
|
||||||
|
- text: >-
|
||||||
|
First entry description.
|
||||||
|
output:
|
||||||
|
url: img1.png
|
||||||
|
- text: second entry
|
||||||
|
output:
|
||||||
|
url: img2.png
|
||||||
|
- text: 'third entry'
|
||||||
|
output:
|
||||||
|
url: img3.png
|
||||||
|
---"""
|
||||||
|
imgs = R.extract_gallery_images(readme, "user/repo")
|
||||||
|
assert len(imgs) == 3
|
||||||
|
assert imgs[0]["meta"]["prompt"] == "First entry description."
|
||||||
|
assert imgs[1]["meta"]["prompt"] == "second entry"
|
||||||
|
assert imgs[2]["meta"]["prompt"] == "third entry"
|
||||||
|
|
||||||
|
|
||||||
|
# ======================================================================
|
||||||
|
# extract_simple_markdown_images
|
||||||
|
# ======================================================================
|
||||||
|
|
||||||
|
|
||||||
|
class TestExtractSimpleMarkdownImages:
|
||||||
|
def test_empty(self, R):
|
||||||
|
assert R.extract_simple_markdown_images("", "repo") == []
|
||||||
|
|
||||||
|
def test_basic_markdown_image(self, R):
|
||||||
|
"""``![test](./image_0.png)`` → absolute URL."""
|
||||||
|
imgs = R.extract_simple_markdown_images("![test](./image_0.png)", "u/r")
|
||||||
|
assert len(imgs) == 1
|
||||||
|
assert "image_0.png" in imgs[0]["url"]
|
||||||
|
assert imgs[0]["meta"]["prompt"] == "test"
|
||||||
|
|
||||||
|
def test_absolute_url(self, R):
|
||||||
|
"""``![alt](https://example.com/img.png)`` → keep as-is."""
|
||||||
|
imgs = R.extract_simple_markdown_images(
|
||||||
|
"![alt](https://example.com/img.png)", "u/r"
|
||||||
|
)
|
||||||
|
assert len(imgs) == 1
|
||||||
|
assert imgs[0]["url"] == "https://example.com/img.png"
|
||||||
|
|
||||||
|
def test_skips_code_fences(self, R):
|
||||||
|
"""Inside ``` blocks should be ignored."""
|
||||||
|
text = """outside
|
||||||
|
```
|
||||||
|

|
||||||
|
```
|
||||||
|
outside again
|
||||||
|
"""
|
||||||
|
imgs = R.extract_simple_markdown_images(text, "u/r")
|
||||||
|
assert len(imgs) == 1
|
||||||
|
assert "valid.png" in imgs[0]["url"]
|
||||||
|
|
||||||
|
def test_deduplicates(self, R):
|
||||||
|
text = "\n"
|
||||||
|
imgs = R.extract_simple_markdown_images(text, "u/r")
|
||||||
|
assert len(imgs) == 1 # deduplicated
|
||||||
|
|
||||||
|
|
||||||
|
# ======================================================================
|
||||||
|
# extract_html_img_tags
|
||||||
|
# ======================================================================
|
||||||
|
|
||||||
|
|
||||||
|
class TestExtractHtmlImgTags:
|
||||||
|
def test_double_quoted_src(self, R):
|
||||||
|
imgs = R.extract_html_img_tags('<img src="./img.png">', "u/r")
|
||||||
|
assert len(imgs) == 1
|
||||||
|
assert "img.png" in imgs[0]["url"]
|
||||||
|
|
||||||
|
def test_single_quoted_src(self, R):
|
||||||
|
imgs = R.extract_html_img_tags("<img src='./img.png'>", "u/r")
|
||||||
|
assert len(imgs) == 1
|
||||||
|
assert "img.png" in imgs[0]["url"]
|
||||||
|
|
||||||
|
def test_absolute_url(self, R):
|
||||||
|
imgs = R.extract_html_img_tags(
|
||||||
|
'<img src="https://cdn.example.com/img.png">', "u/r"
|
||||||
|
)
|
||||||
|
assert len(imgs) == 1
|
||||||
|
assert imgs[0]["url"] == "https://cdn.example.com/img.png"
|
||||||
|
|
||||||
|
def test_deduplicates_across_formats(self, R):
|
||||||
|
text = '<img src="./img.png">\n<img src=\'./img.png\'>'
|
||||||
|
imgs = R.extract_html_img_tags(text, "u/r")
|
||||||
|
assert len(imgs) == 1
|
||||||
|
|
||||||
|
|
||||||
|
# ======================================================================
|
||||||
|
# extract_gallery_table_images
|
||||||
|
# ======================================================================
|
||||||
|
|
||||||
|
|
||||||
|
class TestExtractGalleryTableImages:
|
||||||
|
def test_gallery_table(self, R):
|
||||||
|
text = """| Preview | Prompt |
|
||||||
|
|--------|--------|
|
||||||
|
|  | a cat |
|
||||||
|
|  | a dog |"""
|
||||||
|
imgs = R.extract_gallery_table_images(text, "u/r")
|
||||||
|
assert len(imgs) == 2
|
||||||
|
assert imgs[0]["meta"]["prompt"] == "a cat"
|
||||||
|
assert "a.png" in imgs[0]["url"]
|
||||||
|
assert imgs[1]["meta"]["prompt"] == "a dog"
|
||||||
|
|
||||||
|
def test_skips_non_gallery_table(self, R):
|
||||||
|
text = """| Parameter | Value |
|
||||||
|
|----------|-------|
|
||||||
|
| Steps | 4 |"""
|
||||||
|
imgs = R.extract_gallery_table_images(text, "u/r")
|
||||||
|
assert len(imgs) == 0
|
||||||
|
|
||||||
|
|
||||||
|
# ======================================================================
|
||||||
|
# clean_readme_for_llm + strip helpers
|
||||||
|
# ======================================================================
|
||||||
|
|
||||||
|
|
||||||
|
class TestCleanReadmeForLlm:
|
||||||
|
def test_preserves_plain_code_block(self, R):
|
||||||
|
"""`` ``` `` without language tag → preserved (trigger words)."""
|
||||||
|
text = """Before
|
||||||
|
```
|
||||||
|
pixel art sprite, game asset
|
||||||
|
```
|
||||||
|
After"""
|
||||||
|
cleaned = R.clean_readme_for_llm(text)
|
||||||
|
assert "pixel art sprite" in cleaned
|
||||||
|
assert "game asset" in cleaned
|
||||||
|
|
||||||
|
def test_strips_fenced_code_with_lang(self, R):
|
||||||
|
"""`` ```python `` → stripped."""
|
||||||
|
text = "before\n```python\nimport torch\n```\nafter"
|
||||||
|
cleaned = R.clean_readme_for_llm(text)
|
||||||
|
assert "import torch" not in cleaned
|
||||||
|
assert "before" in cleaned
|
||||||
|
assert "after" in cleaned
|
||||||
|
|
||||||
|
def test_preserves_markdown_image_url(self, R):
|
||||||
|
"""``![preview](./preview.png)`` → URL kept for LLM preview extraction."""
|
||||||
|
text = "![preview](./preview.png)"
|
||||||
|
cleaned = R.clean_readme_for_llm(text)
|
||||||
|
assert "./preview.png" in cleaned
|
||||||
|
|
||||||
|
def test_strips_html_img_tag(self, R):
|
||||||
|
"""``<img src="...">`` → stripped."""
|
||||||
|
text = 'before\n<img src="logo.png">\nafter'
|
||||||
|
cleaned = R.clean_readme_for_llm(text)
|
||||||
|
assert "logo.png" not in cleaned
|
||||||
|
|
||||||
|
def test_widget_stripped_frontmatter_preserved(self, R):
|
||||||
|
"""Widget YAML stripped but ``base_model:`` kept."""
|
||||||
|
text = """---
|
||||||
|
tags: [test]
|
||||||
|
widget:
|
||||||
|
- text: >-
|
||||||
|
long description here
|
||||||
|
output:
|
||||||
|
url: img.png
|
||||||
|
base_model: black-forest-labs/FLUX.1-dev
|
||||||
|
instance_prompt: test
|
||||||
|
---"""
|
||||||
|
cleaned = R.clean_readme_for_llm(text)
|
||||||
|
assert "widget:" not in cleaned
|
||||||
|
assert "black-forest-labs/FLUX.1-dev" in cleaned
|
||||||
|
assert "instance_prompt: test" in cleaned
|
||||||
|
|
||||||
|
def test_training_table_stripped(self, R):
|
||||||
|
"""Training-parameter table → stripped."""
|
||||||
|
text = """before
|
||||||
|
| LR Scheduler | constant |
|
||||||
|
|--------------|---------|
|
||||||
|
| Optimizer | AdamW |
|
||||||
|
after"""
|
||||||
|
cleaned = R.clean_readme_for_llm(text)
|
||||||
|
assert "LR Scheduler" not in cleaned
|
||||||
|
assert "Optimizer" not in cleaned
|
||||||
|
assert "before" in cleaned
|
||||||
|
assert "after" in cleaned
|
||||||
|
|
||||||
|
def test_best_dimensions_table_kept(self, R):
|
||||||
|
"""Non-training table (Best Dimensions) → kept."""
|
||||||
|
text = """## Best Dimensions
|
||||||
|
- 768 x 1024 (Best)
|
||||||
|
- 1024 x 1024 (Default)"""
|
||||||
|
cleaned = R.clean_readme_for_llm(text)
|
||||||
|
assert "768 x 1024" in cleaned
|
||||||
|
|
||||||
|
def test_boilerplate_section_stripped(self, R):
|
||||||
|
text = """stuff
|
||||||
|
## Download model
|
||||||
|
[link](url)
|
||||||
|
## Next section
|
||||||
|
content"""
|
||||||
|
cleaned = R.clean_readme_for_llm(text)
|
||||||
|
assert "Download model" not in cleaned
|
||||||
|
assert "Next section" in cleaned
|
||||||
|
assert "content" in cleaned
|
||||||
|
|
||||||
|
def test_returns_empty_for_none(self, R):
|
||||||
|
assert R.clean_readme_for_llm(None) == ""
|
||||||
|
|
||||||
|
def test_returns_empty_for_empty(self, R):
|
||||||
|
assert R.clean_readme_for_llm("") == ""
|
||||||
|
|
||||||
|
|
||||||
|
# ======================================================================
|
||||||
|
# _is_heading / _heading_level
|
||||||
|
# ======================================================================
|
||||||
|
|
||||||
|
|
||||||
|
class TestHeadingDetection:
|
||||||
|
@pytest.mark.parametrize(
|
||||||
|
"line,expected",
|
||||||
|
[
|
||||||
|
("# Title", 1),
|
||||||
|
("## Sub", 2),
|
||||||
|
("### Subsub", 3),
|
||||||
|
("#### Subsubsub", 4),
|
||||||
|
("<h1>Title</h1>", 1),
|
||||||
|
("<h2>Sub</h2>", 2),
|
||||||
|
("<h3 class='x'>Sub</h3>", 3),
|
||||||
|
("<h4 id='y'>Sub</h4>", 4),
|
||||||
|
("not a heading", 0),
|
||||||
|
("###", 0), # no text after ###
|
||||||
|
("</h2>", 0), # closing tag, not a heading
|
||||||
|
("", 0),
|
||||||
|
],
|
||||||
|
)
|
||||||
|
def test_heading_level(self, R, line, expected):
|
||||||
|
assert R._heading_level(line) == expected
|
||||||
|
|
||||||
|
@pytest.mark.parametrize(
|
||||||
|
"line,expected",
|
||||||
|
[
|
||||||
|
("# Title", True),
|
||||||
|
("<h2>Sub</h2>", True),
|
||||||
|
("</h2>", False), # closing tag
|
||||||
|
("not heading", False),
|
||||||
|
],
|
||||||
|
)
|
||||||
|
def test_is_heading(self, R, line, expected):
|
||||||
|
assert R._is_heading(line) == expected
|
||||||
|
|
||||||
|
|
||||||
|
# ======================================================================
|
||||||
|
# extract_relevant_section
|
||||||
|
# ======================================================================
|
||||||
|
|
||||||
|
|
||||||
|
class TestExtractRelevantSection:
|
||||||
|
def test_fallback_full_readme(self, R):
|
||||||
|
"""No match → full README returned."""
|
||||||
|
readme = "# Title\n\nsome content"
|
||||||
|
assert R.extract_relevant_section(readme, "nonexistent") == readme
|
||||||
|
|
||||||
|
def test_empty_basename_returns_full(self, R):
|
||||||
|
readme = "# Title"
|
||||||
|
assert R.extract_relevant_section(readme, "") == readme
|
||||||
|
|
||||||
|
def test_match_heading_includes_yaml(self, R):
|
||||||
|
"""Matching heading should still include YAML frontmatter."""
|
||||||
|
readme = """---
|
||||||
|
base_model: foo
|
||||||
|
---
|
||||||
|
# My-Model-Title
|
||||||
|
|
||||||
|
content
|
||||||
|
## Subsection
|
||||||
|
more"""
|
||||||
|
section = R.extract_relevant_section(readme, "My-Model")
|
||||||
|
assert "base_model: foo" in section
|
||||||
|
assert "content" in section
|
||||||
|
assert "Subsection" in section
|
||||||
|
|
||||||
|
def test_match_heading_includes_subheadings(self, R):
|
||||||
|
"""``# Title`` match includes all ``##`` children."""
|
||||||
|
readme = """# Main Title
|
||||||
|
|
||||||
|
## Child A
|
||||||
|
content A
|
||||||
|
## Child B
|
||||||
|
content B
|
||||||
|
## Child C
|
||||||
|
content C"""
|
||||||
|
section = R.extract_relevant_section(readme, "Main Title")
|
||||||
|
assert "Child A" in section
|
||||||
|
assert "Child B" in section
|
||||||
|
assert "Child C" in section
|
||||||
|
|
||||||
|
def test_match_download_link(self, R):
|
||||||
|
"""Download link containing basename → section extracted."""
|
||||||
|
readme = """# Collection
|
||||||
|
## Model A
|
||||||
|
[Download](./model_a.safetensors)
|
||||||
|
## MyModel
|
||||||
|
[Download](./mymodel.safetensors)
|
||||||
|
content here
|
||||||
|
## Model B
|
||||||
|
other"""
|
||||||
|
section = R.extract_relevant_section(readme, "mymodel")
|
||||||
|
assert "content here" in section
|
||||||
|
assert "Model A" not in section # should not include sibling
|
||||||
|
|
||||||
|
def test_heading_closing_tag_not_boundary(self, R):
|
||||||
|
"""``</h2>`` should NOT be treated as a section boundary."""
|
||||||
|
readme = """# Title
|
||||||
|
<p>some text</p>
|
||||||
|
</h2>
|
||||||
|
## Real Section
|
||||||
|
content"""
|
||||||
|
section = R.extract_relevant_section(readme, "Title")
|
||||||
|
assert "Real Section" in section # forward walk should not stop at </h2>
|
||||||
|
assert "content" in section
|
||||||
|
|
||||||
|
|
||||||
|
# ======================================================================
|
||||||
|
# _extract_frontmatter
|
||||||
|
# ======================================================================
|
||||||
|
|
||||||
|
|
||||||
|
class TestExtractFrontmatter:
|
||||||
|
def test_basic(self, R):
|
||||||
|
assert R._extract_frontmatter("---\ntags: [a]\n---\nbody") == "\ntags: [a]\n"
|
||||||
|
|
||||||
|
def test_no_frontmatter(self, R):
|
||||||
|
assert R._extract_frontmatter("no dashes") == ""
|
||||||
|
|
||||||
|
def test_empty_string(self, R):
|
||||||
|
assert R._extract_frontmatter("") == ""
|
||||||
|
|
||||||
|
|
||||||
|
# ======================================================================
|
||||||
|
# _strip_widget_section
|
||||||
|
# ======================================================================
|
||||||
|
|
||||||
|
|
||||||
|
class TestStripWidgetSection:
|
||||||
|
def test_strip_widget_keep_base_model(self, R):
|
||||||
|
"""Widget stripped but ``base_model:`` preserved."""
|
||||||
|
text = """---
|
||||||
|
tags: [test]
|
||||||
|
widget:
|
||||||
|
- text: >-
|
||||||
|
long text
|
||||||
|
output:
|
||||||
|
url: img.png
|
||||||
|
base_model: black-forest-labs/FLUX.1-dev
|
||||||
|
---"""
|
||||||
|
result = R._strip_widget_section(text)
|
||||||
|
assert "widget:" not in result
|
||||||
|
assert "black-forest-labs/FLUX.1-dev" in result
|
||||||
|
|
||||||
|
def test_no_widget_no_change(self, R):
|
||||||
|
text = "---\ntags: [a]\n---"
|
||||||
|
assert R._strip_widget_section(text) == text
|
||||||
|
|
||||||
|
def test_widget_at_end_of_frontmatter(self, R):
|
||||||
|
"""Widget is the last YAML key before closing ---."""
|
||||||
|
text = """---
|
||||||
|
base_model: a
|
||||||
|
widget:
|
||||||
|
- text: x
|
||||||
|
output:
|
||||||
|
url: y.png
|
||||||
|
---"""
|
||||||
|
result = R._strip_widget_section(text)
|
||||||
|
assert "widget:" not in result
|
||||||
|
assert "base_model: a" in result
|
||||||
|
|
||||||
|
|
||||||
|
# ======================================================================
|
||||||
|
# _strip_fenced_code_blocks
|
||||||
|
# ======================================================================
|
||||||
|
|
||||||
|
|
||||||
|
class TestStripFencedCodeBlocks:
|
||||||
|
def test_strips_with_language(self, R):
|
||||||
|
text = "a\n```python\ncode\n```\nb"
|
||||||
|
assert R._strip_fenced_code_blocks(text) == "a\nb"
|
||||||
|
|
||||||
|
def test_keeps_plain_fence(self, R):
|
||||||
|
"""`` ``` `` without language → preserved."""
|
||||||
|
text = "a\n```\ntrigger words\n```\nb"
|
||||||
|
assert "trigger words" in R._strip_fenced_code_blocks(text)
|
||||||
|
|
||||||
|
def test_pattern(self, R):
|
||||||
|
text = "x\n```yaml\nkey: val\n```\ny"
|
||||||
|
assert "key: val" not in R._strip_fenced_code_blocks(text)
|
||||||
@@ -128,7 +128,7 @@ def generate_optimisation_suggestions(
|
|||||||
if prev and prev.get("empty_rate_pct", 0) > 50:
|
if prev and prev.get("empty_rate_pct", 0) > 50:
|
||||||
suggestions.append(
|
suggestions.append(
|
||||||
"- **预览图下载成功率低 ({:.0f}%)**: 很多 HF 模型卡没有 embed 图片(仅使用 YAML widget "
|
"- **预览图下载成功率低 ({:.0f}%)**: 很多 HF 模型卡没有 embed 图片(仅使用 YAML widget "
|
||||||
"或 external link)。当前 `md_to_html.py` 的 `extract_gallery_images` 和 "
|
"或 external link)。当前 `readme_processor.py` 的 `extract_gallery_images` 和 "
|
||||||
"`extract_gallery_table_images` 已覆盖了多数场景。若预览图不重要,可降低此字段权重。".format(
|
"`extract_gallery_table_images` 已覆盖了多数场景。若预览图不重要,可降低此字段权重。".format(
|
||||||
prev.get("empty_rate_pct", 0)
|
prev.get("empty_rate_pct", 0)
|
||||||
)
|
)
|
||||||
|
|||||||
Reference in New Issue
Block a user