Files
ComfyUI-Lora-Manager/py/services/agent/post_processor.py
Will Miao 51c0135250 refactor(agent): rename agent_cli to metadata_ops, strip temp debug logs
- Rename py/agent_cli/ -> py/metadata_ops/ (module was never agent-related)
- Rename tests/agent_cli/ -> tests/metadata_ops/
- Remove 9 low-value/debug INFO log points across agent_handlers.py,
  agent_service.py, llm_service.py, and metadata_ops/__init__.py
- Keep LLM raw response at DEBUG level for diagnostics
- Consolidate per-model progress + LLM result into single concise
  log line with basename instead of full path
- Update package/class/method docstrings to clarify this is a
  pipeline infrastructure, not a true agent loop
2026-07-05 18:00:58 +08:00

337 lines
14 KiB
Python

"""Post-processing engine for skill pipeline outputs.
The :class:`PostProcessor` takes the LLM's structured JSON output and applies
it to a model's on-disk metadata via the :mod:`~py.metadata_ops` functions.
It handles all the skill-specific business logic — conditions, transformations,
and orchestration of multiple side-effects (write metadata, download preview,
refresh cache). All actual I/O is delegated to :mod:`~py.metadata_ops`.
"""
from __future__ import annotations
import json
import logging
import os
import re
from datetime import datetime, timezone
from typing import Any, Dict, List, Optional
logger = logging.getLogger(__name__)
class PostProcessor:
    """Deterministic post-processor for skill pipeline outputs.

    Usage (called by :class:`~py.services.agent.agent_service.AgentService`)::

        processor = PostProcessor()
        result = await processor.process(
            skill_name="enrich_hf_metadata",
            model_path="/path/to/model.safetensors",
            llm_output={...},
            metadata={...},  # from metadata_ops.read_metadata()
        )
    """

    async def process(
        self,
        *,
        skill_name: str,
        model_path: str,
        llm_output: Dict[str, Any],
        metadata: Dict[str, Any],
        readme_content: str = "",
    ) -> Dict[str, Any]:
        """Route *llm_output* to the correct skill post-processor.

        *readme_content* is optional raw markdown content (e.g. HF README)
        that is converted to HTML and stored as ``modelDescription`` for
        the description tab.

        Returns a dict with keys ``success`` (bool), ``updated_fields`` (list),
        ``preview_downloaded`` (bool), and ``errors`` (list).  A handled skill
        additionally returns ``updates`` (the dict of values it tried to write).
        """
        if skill_name == "enrich_hf_metadata":
            return await self._process_enrich_hf_metadata(
                model_path, llm_output, metadata, readme_content,
            )
        # Unknown skill: keep the same result shape as the success path so
        # callers can read ``preview_downloaded`` unconditionally.
        return {
            "success": False,
            "updated_fields": [],
            "preview_downloaded": False,
            "errors": [f"No post-processor registered for skill: {skill_name}"],
        }

    # ------------------------------------------------------------------
    # enrich_hf_metadata
    # ------------------------------------------------------------------
    async def _process_enrich_hf_metadata(
        self,
        model_path: str,
        llm_output: Dict[str, Any],
        metadata: Dict[str, Any],
        readme_content: str = "",
    ) -> Dict[str, Any]:
        """Apply the ``enrich_hf_metadata`` LLM result to the model's metadata.

        Collects field updates from *llm_output* and *readme_content*,
        optionally downloads a preview image, writes the updates through
        ``apply_metadata_updates`` and refreshes the cache when anything
        changed.  Malformed (non-string) LLM fields are skipped instead of
        aborting the whole run.

        Returns a dict with ``success``, ``updated_fields``,
        ``preview_downloaded``, ``updates`` and ``errors``.
        """
        # Imported lazily inside the method, as in the original code.
        from ...metadata_ops import (
            apply_metadata_updates,
            download_preview,
            refresh_cache,
        )
        from .skills.enrich_hf_metadata.readme_processor import (
            convert_readme_to_html,
            extract_gallery_images,
            extract_gallery_table_images,
            extract_relevant_section,
            extract_simple_markdown_images,
            extract_html_img_tags,
            extract_repo_from_hf_url,
        )

        updated_fields: List[str] = []
        preview_downloaded = False

        # -- Determine whether this is an HF-sourced model -----------------
        # A missing ``from_civitai`` key is treated as "from Civitai".
        is_hf_model = not metadata.get("from_civitai", True)

        # -- Collect updates -----------------------------------------------
        updates: Dict[str, Any] = {}

        # base_model
        new_base = self._clean_text(llm_output.get("base_model"))
        current_base = metadata.get("base_model", "") or ""
        if new_base and self._should_overwrite(current_base, is_hf_model):
            updates["base_model"] = new_base

        # trigger words → civitai.trainedWords
        new_triggers = llm_output.get("trigger_words", [])
        trigger_words_empty = True
        if isinstance(new_triggers, list):
            # Ignore non-string entries rather than crashing on ``.strip()``.
            cleaned = [
                t.strip()
                for t in new_triggers
                if isinstance(t, str) and t.strip()
            ]
            cleaned = [
                t for t in cleaned if t.lower() not in ("none", "null", "n/a")
            ]
            trigger_words_empty = not cleaned
            current_triggers = (
                (metadata.get("civitai") or {}).get("trainedWords") or []
            )
            if self._should_overwrite_list(current_triggers, is_hf_model):
                trig_civitai = self._pending_civitai(metadata, updates)
                trig_civitai["trainedWords"] = cleaned
                updates["civitai"] = trig_civitai

        # modelDescription — from raw README content (converted to HTML)
        if readme_content and is_hf_model:
            converted = convert_readme_to_html(readme_content)
            if converted:
                updates["modelDescription"] = converted

        # short_description → civitai.description (for "About this version")
        short_desc = self._clean_text(llm_output.get("short_description"))
        if short_desc and is_hf_model:
            desc_civitai = self._pending_civitai(metadata, updates)
            desc_civitai["description"] = short_desc
            updates["civitai"] = desc_civitai

        # gallery images → civitai.images (from YAML frontmatter widget entries
        # and Sample Gallery markdown tables in the README body)
        gallery_images: List[Dict[str, Any]] = []
        if readme_content and is_hf_model:
            hf_url = metadata.get("hf_url", "") or ""
            repo = extract_repo_from_hf_url(hf_url)
            if repo:
                rec_w = llm_output.get("recommended_width") or 0
                rec_h = llm_output.get("recommended_height") or 0

                # 1. Widget images (YAML frontmatter)
                gallery = extract_gallery_images(
                    readme_content, repo,
                    default_width=rec_w, default_height=rec_h,
                )

                # 2. Sample Gallery table images (markdown body), deduplicated
                existing_urls = {img["url"] for img in gallery if img.get("url")}
                table_images = extract_gallery_table_images(
                    readme_content, repo,
                    existing_urls=existing_urls,
                    default_width=rec_w, default_height=rec_h,
                )
                existing_urls.update(
                    img["url"] for img in table_images if img.get("url")
                )

                # 3. Simple markdown images `![alt](url)` in the body
                simple_images = extract_simple_markdown_images(
                    readme_content, repo,
                    existing_urls=existing_urls,
                    default_width=rec_w, default_height=rec_h,
                )
                existing_urls.update(
                    img["url"] for img in simple_images if img.get("url")
                )

                # 4. HTML `<img>` tags (used by many collection repos)
                html_images = extract_html_img_tags(
                    readme_content, repo,
                    existing_urls=existing_urls,
                    default_width=rec_w, default_height=rec_h,
                )

                all_images = gallery + table_images + simple_images + html_images
                if all_images:
                    gallery_images = all_images
                    gallery_civitai = self._pending_civitai(metadata, updates)
                    gallery_civitai["images"] = all_images
                    updates["civitai"] = gallery_civitai

        # tags
        new_tags = llm_output.get("tags", [])
        if isinstance(new_tags, list) and new_tags:
            existing_tags = metadata.get("tags") or []
            merged = self._merge_tags(existing_tags, new_tags)
            if len(merged) > len(existing_tags) or is_hf_model:
                updates["tags"] = merged

        # metadata_source & llm_enriched_at (always set)
        updates["metadata_source"] = "agent:enrich_hf_metadata"
        updates["llm_enriched_at"] = datetime.now(timezone.utc).isoformat()

        # Store LLM confidence in metadata so it's accessible for evaluation.
        # ``str()`` tolerates a numeric confidence (e.g. 0.9) from the LLM.
        raw_confidence = str(llm_output.get("confidence") or "").strip()
        if raw_confidence:
            updates["_llm_confidence"] = raw_confidence

        # Fallback: extract instance_prompt from YAML frontmatter when the LLM
        # returned empty trigger words but the README has instance_prompt.
        if trigger_words_empty:
            instance_prompt = _extract_yaml_instance_prompt(readme_content)
            if instance_prompt:
                stored_triggers = (
                    (metadata.get("civitai") or {}).get("trainedWords") or []
                )
                # Follow the same overwrite policy as the primary trigger-word
                # path so existing words on a non-HF model are not clobbered.
                if self._should_overwrite_list(stored_triggers, is_hf_model):
                    trig_civitai = self._pending_civitai(metadata, updates)
                    trig_civitai["trainedWords"] = [instance_prompt]
                    updates["civitai"] = trig_civitai

        preview_remote_url = self._clean_text(llm_output.get("preview_url"))

        # Fallback: if the LLM couldn't find a preview image in the cleaned
        # README, find the first gallery image from the *model-specific
        # section* of the README (not the repo-wide first image, which
        # belongs to a different model in collection repos).
        if not preview_remote_url and readme_content and is_hf_model:
            model_basename = os.path.splitext(os.path.basename(model_path))[0]
            relevant_section = extract_relevant_section(
                readme_content, model_basename,
            )
            if relevant_section and relevant_section != readme_content:
                for img in gallery_images:
                    img_url = img.get("url", "")
                    if img_url and img_url in relevant_section:
                        preview_remote_url = img_url
                        break

        # Last resort: use the first gallery image from the full README.
        if not preview_remote_url and gallery_images:
            preview_remote_url = gallery_images[0].get("url", "")

        # Only download when there is no existing preview file on disk.
        current_preview = metadata.get("preview_url") or ""
        has_local_preview = bool(current_preview) and os.path.exists(current_preview)
        if preview_remote_url and not has_local_preview:
            local_path = await download_preview(model_path, preview_remote_url)
            if local_path:
                preview_downloaded = True
                updates["preview_url"] = local_path

        # notes — plain-text summary of usage info from the LLM
        new_notes = self._clean_text(llm_output.get("notes"))
        if new_notes:
            updates["notes"] = new_notes

        # usage_tips — JSON string (e.g. {"strength_min":0.85,"strength_max":1.4})
        # A dict from the LLM is serialised to the same JSON-string form;
        # any other non-string type is reported and skipped.
        tips_value = llm_output.get("usage_tips") or ""
        if isinstance(tips_value, dict):
            raw_tips = json.dumps(tips_value)
        elif isinstance(tips_value, str):
            raw_tips = tips_value.strip()
        else:
            logger.warning(
                "LLM returned usage_tips of unexpected type: %s",
                type(tips_value).__name__,
            )
            raw_tips = ""
        if raw_tips and raw_tips != "{}":
            try:
                json.loads(raw_tips)  # validation only; the raw string is stored
            except (json.JSONDecodeError, TypeError):
                logger.warning(
                    "LLM returned invalid usage_tips JSON: %s", raw_tips[:200]
                )
            else:
                updates["usage_tips"] = raw_tips

        if updates:
            updated_fields = await apply_metadata_updates(model_path, updates)

        # -- Refresh scanner cache ------------------------------------------
        if updated_fields or preview_downloaded:
            await refresh_cache(model_path)

        return {
            "success": True,
            "updated_fields": updated_fields,
            "preview_downloaded": preview_downloaded,
            "updates": updates,
            "errors": [],
        }

    # ------------------------------------------------------------------
    # Helpers
    # ------------------------------------------------------------------
    @staticmethod
    def _clean_text(value: Any) -> str:
        """Return *value* stripped when it is a string, otherwise ``""``."""
        return value.strip() if isinstance(value, str) else ""

    @staticmethod
    def _pending_civitai(
        metadata: Dict[str, Any], updates: Dict[str, Any]
    ) -> Dict[str, Any]:
        """Return a copy of the stored ``civitai`` dict with pending updates overlaid.

        The overlay is shallow: keys already queued in ``updates["civitai"]``
        replace the stored ones, so successive writers do not lose each
        other's changes.
        """
        merged = dict(metadata.get("civitai") or {})
        pending = updates.get("civitai")
        if isinstance(pending, dict):
            merged.update(pending)
        return merged

    @staticmethod
    def _should_overwrite(current_value: str, is_hf_model: bool) -> bool:
        """Return ``True`` when a scalar field should be overwritten.

        That is the case for HF models, and for empty or ``"unknown"``
        (case-insensitive) current values.
        """
        return (
            is_hf_model
            or not current_value
            or current_value.lower() == "unknown"
        )

    @staticmethod
    def _should_overwrite_list(current_list: List[str], is_hf_model: bool) -> bool:
        """Return ``True`` when a list field should be overwritten."""
        return is_hf_model or not current_list

    @staticmethod
    def _merge_tags(existing: List[str], new: List[str]) -> List[str]:
        """Merge *new* tags into *existing*, all lowercased.

        This matches the behaviour of :class:`TagUpdateService` which
        normalises every tag to lowercase for case-insensitive dedup.
        Order of first appearance is kept; blank and non-string entries
        are dropped.
        """
        merged: List[str] = []
        seen: set = set()
        for tag in list(existing) + list(new):
            if not isinstance(tag, str):
                continue  # malformed LLM entry (None, number, ...)
            t = tag.strip().lower()
            if t and t not in seen:
                merged.append(t)
                seen.add(t)
        return merged
# ------------------------------------------------------------------
# Module-level helpers
# ------------------------------------------------------------------
def _extract_yaml_instance_prompt(readme_content: str) -> str:
    """Extract ``instance_prompt`` from the YAML frontmatter of a HF README.

    The frontmatter is the text between a leading ``---`` and the next
    ``---`` that starts a line.  Returns the prompt text with surrounding
    quotes removed, or an empty string if there is no frontmatter, no
    closing delimiter, or no ``instance_prompt`` key.  ``null`` / ``~`` /
    ``none`` values (case-insensitive) and empty values also yield an
    empty string.
    """
    if not readme_content or not readme_content.startswith("---"):
        return ""

    # Find end of frontmatter.  The closing delimiter must start a line: a
    # plain substring search would stop at a ``---`` embedded in a value
    # (e.g. ``title: a---b``) and cut the frontmatter short.
    body = readme_content[3:]
    closing = re.search(r"^---", body, re.MULTILINE)
    if closing is None:
        return ""
    frontmatter = body[:closing.start()]

    for line in frontmatter.split("\n"):
        # ``strip()`` also removes a trailing ``\r`` from CRLF files.
        m = re.match(r"^instance_prompt:\s*(.*)", line.strip())
        if m:
            val = m.group(1).strip().strip('"').strip("'")
            if val.lower() in ("null", "~", "none", ""):
                return ""
            return val
    return ""