"""Centralized LLM API client with BYOK (bring-your-own-key) provider support. Reads provider configuration from :class:`SettingsManager` and makes OpenAI-compatible ``/chat/completions`` calls. Supports any provider that implements the OpenAI Chat Completions API surface area (OpenAI, Ollama, vLLM, LM Studio, etc.). """ from __future__ import annotations import asyncio import json import logging from typing import Any, Dict, List, Optional import aiohttp from .errors import LLMNotConfiguredError, LLMRateLimitError, LLMResponseError logger = logging.getLogger(__name__) # Default API base URLs per provider _PROVIDER_DEFAULTS: Dict[str, str] = { "openai": "https://api.openai.com/v1", "ollama": "http://localhost:11434/v1", # "custom" requires an explicit llm_api_base from the user } # Request timeout for LLM calls (seconds) _LLM_TIMEOUT = aiohttp.ClientTimeout(total=120) class LLMService: """Centralized LLM API client. All agent skills call LLMs through this service so that BYOK config, retry logic, and error handling live in one place. """ _instance: Optional["LLMService"] = None _lock: asyncio.Lock = asyncio.Lock() def __init__(self, settings_service) -> None: self._settings = settings_service # ------------------------------------------------------------------ # Singleton access # ------------------------------------------------------------------ @classmethod async def get_instance(cls) -> "LLMService": """Return the lazily-initialised global ``LLMService`` instance.""" if cls._instance is None: async with cls._lock: if cls._instance is None: from .settings_manager import get_settings_manager cls._instance = cls(get_settings_manager()) return cls._instance @classmethod def reset_instance(cls) -> None: """Reset the cached singleton — primarily for tests.""" cls._instance = None # ------------------------------------------------------------------ # Configuration helpers # ------------------------------------------------------------------ def _get_config(self) -> Dict[str, Any]: """Read the current LLM configuration from settings.""" return { "provider": self._settings.get("llm_provider", "openai"), "api_key": self._settings.get("llm_api_key", ""), "api_base": self._settings.get("llm_api_base", ""), "model": self._settings.get("llm_model", ""), } def is_configured(self) -> bool: """Return ``True`` when the LLM provider is minimally configured. A provider is considered configured when ``llm_model`` is set and (for non-Ollama) an API key is configured. """ cfg = self._get_config() has_model = bool(cfg["model"]) has_key = bool(cfg["api_key"]) or cfg["provider"] == "ollama" return has_model and has_key def _resolve_api_base(self, provider: str, api_base: str) -> str: """Resolve the API base URL for the given provider.""" if api_base: return api_base.rstrip("/") return _PROVIDER_DEFAULTS.get(provider, "").rstrip("/") def _build_headers(self, api_key: str) -> Dict[str, str]: """Build HTTP headers for the LLM API request.""" headers = {"Content-Type": "application/json"} if api_key: headers["Authorization"] = f"Bearer {api_key}" return headers def _ensure_configured(self) -> Dict[str, Any]: """Validate configuration and return it, or raise. A provider is considered configured when ``llm_model`` is set and (for non-Ollama) an API key is configured. """ cfg = self._get_config() has_model = bool(cfg["model"]) has_key = bool(cfg["api_key"]) or cfg["provider"] == "ollama" if not (has_model and has_key): parts = [] if not has_model: parts.append("No LLM model specified") if not has_key and cfg["provider"] != "ollama": parts.append("No LLM API key configured") detail = "; ".join(parts) if parts else "LLM provider is not configured" raise LLMNotConfiguredError( f"{detail}. Configure it in Settings → AI Provider." ) return cfg # ------------------------------------------------------------------ # Core API call # ------------------------------------------------------------------ async def chat_completion( self, *, messages: List[Dict[str, str]], model: Optional[str] = None, temperature: float = 0.3, response_format: Optional[Dict[str, Any]] = None, max_tokens: Optional[int] = None, retry_on_rate_limit: bool = True, ) -> Dict[str, Any]: """Call the configured LLM provider's ``/chat/completions`` endpoint. Args: messages: OpenAI-format message list model: Override the configured model name temperature: Sampling temperature response_format: Optional ``{"type": "json_object"}`` for structured output max_tokens: Optional max output tokens retry_on_rate_limit: Retry once after a 429 with backoff Returns: Dict with ``content`` (str), ``usage`` (dict), ``model`` (str) Raises: LLMNotConfiguredError: Provider not enabled / missing config LLMRateLimitError: Rate limited and retry exhausted LLMResponseError: Non-200 response or parse failure """ cfg = self._ensure_configured() api_base = self._resolve_api_base(cfg["provider"], cfg["api_base"]) url = f"{api_base}/chat/completions" model_name = model or cfg["model"] payload: Dict[str, Any] = { "model": model_name, "messages": messages, "temperature": temperature, } if response_format is not None: payload["response_format"] = response_format if max_tokens is not None: payload["max_tokens"] = max_tokens headers = self._build_headers(cfg["api_key"]) attempt = 0 max_attempts = 2 if retry_on_rate_limit else 1 while attempt < max_attempts: attempt += 1 try: async with aiohttp.ClientSession(timeout=_LLM_TIMEOUT) as session: async with session.post( url, json=payload, headers=headers ) as resp: if resp.status == 429: if attempt < max_attempts: retry_after = float( resp.headers.get("Retry-After", "5") ) logger.warning( "LLM rate limited, retrying after %.1fs", retry_after, ) await asyncio.sleep(retry_after) continue raise LLMRateLimitError( f"LLM provider rate limited (HTTP 429)", provider=cfg["provider"], ) if resp.status != 200: body = await resp.text() raise LLMResponseError( f"LLM API returned HTTP {resp.status}: " f"{body[:500]}" ) data = await resp.json() except aiohttp.ClientError as exc: raise LLMResponseError(f"Network error calling LLM API: {exc}") from exc # Parse response try: content = data["choices"][0]["message"]["content"] usage = data.get("usage", {}) return { "content": content, "usage": usage, "model": data.get("model", model_name), } except (KeyError, IndexError) as exc: raise LLMResponseError( f"Unexpected LLM response structure: {json.dumps(data)[:500]}" ) from exc # Should not reach here, but satisfy type checker raise LLMRateLimitError("Rate limit retry exhausted", provider=cfg["provider"]) # ------------------------------------------------------------------ # Structured output convenience # ------------------------------------------------------------------ async def chat_completion_json( self, *, system_prompt: str, user_prompt: str, model: Optional[str] = None, temperature: float = 0.3, max_tokens: Optional[int] = None, ) -> Dict[str, Any]: """Call the LLM and return parsed JSON. Sends ``response_format: {"type": "json_object"}`` when the provider supports it, and parses the response content as JSON. If parsing fails, retries once with a clarifying system message. Args: system_prompt: System-level instructions user_prompt: User-level query model: Override the configured model name temperature: Sampling temperature max_tokens: Optional max output tokens Returns: Parsed JSON dict from the LLM response Raises: LLMNotConfiguredError: Provider not configured LLMRateLimitError: Rate limited LLMResponseError: JSON parse failure after retry """ messages = [ {"role": "system", "content": system_prompt}, {"role": "user", "content": user_prompt}, ] # First attempt with JSON mode result = await self.chat_completion( messages=messages, model=model, temperature=temperature, response_format={"type": "json_object"}, max_tokens=max_tokens, ) try: return json.loads(result["content"]) except (json.JSONDecodeError, TypeError) as exc: logger.warning( "LLM JSON parse failed on first attempt: %s. Retrying.", exc ) # Retry with explicit instruction to return valid JSON retry_messages = messages + [ { "role": "assistant", "content": result["content"], }, { "role": "user", "content": ( "The previous response could not be parsed as JSON. " "Please respond with ONLY a valid JSON object, no " "markdown fences or extra text." ), }, ] result = await self.chat_completion( messages=retry_messages, model=model, temperature=0.0, # More deterministic for retry response_format={"type": "json_object"}, max_tokens=max_tokens, ) try: return json.loads(result["content"]) except (json.JSONDecodeError, TypeError) as exc: raise LLMResponseError( f"LLM response could not be parsed as JSON after retry: {exc}\n" f"Raw content: {result['content'][:500]}" ) from exc