Bjornulf_custom_nodes/openai_nodes.py

import os
import base64
import io
import json
import numpy as np
import torch
from PIL import Image
from openai import OpenAI

class OpenAIVisionNode:
    """
    ComfyUI node for OpenAI's Vision API processing
    """

    @classmethod
    def INPUT_TYPES(cls):
        return {
            "required": {
                "image": ("IMAGE",),
                "prompt": ("STRING", {"multiline": True, "default": "Output one line, exactly 7 lowercase fields, separated by semicolons, no spaces:\nsex;race;age;hair_length;body;eye_wear;head_wear\n\nField values:\nsex: male/female\nrace: pale/caucasian/hispanic/black/asian\nage: adult/child\nhair_length: none/long\nbody: average/skinny/fat/obese/muscular\neye_wear: none/glasses\nhead_wear: none/hat/cap\nExamples:\nfemale;black;unknown;long;fat;none;none\nmale;hispanic;unknown;none;muscular;glasses;cap\nmale;asian;unknown;none;kid;none;hat"}),
                "model": (["GPT-4.1 ($2.00/$8.00 per 1M tokens)",
                          "GPT-4.1 mini ($0.40/$1.60 per 1M tokens)",
                          "GPT-4.1 nano ($0.10/$0.40 per 1M tokens)"],),
                "api_key": ("STRING", {"default": "", "multiline": False})
            }
        }

    RETURN_TYPES = ("STRING",)
    RETURN_NAMES = ("analysis",)
    FUNCTION = "analyze_image"
    CATEGORY = "Bjornulf"

    def analyze_image(self, image, prompt, model, api_key=""):
        """Process the image with OpenAI's Vision API"""
        # Get API key from environment if not provided
        if not api_key:
            api_key = os.environ.get('OPENAI_API_KEY')
            if not api_key:
                return ("No OpenAI API key provided. Please enter your API key or set the OPENAI_API_KEY environment variable.",)

        # Map selected model to actual model identifier
        model_mapping = {
            "GPT-4.1 ($2.00/$8.00 per 1M tokens)": "gpt-4.1",
            "GPT-4.1 mini ($0.40/$1.60 per 1M tokens)": "gpt-4.1-mini",
            "GPT-4.1 nano ($0.10/$0.40 per 1M tokens)": "gpt-4.1-nano"
        }

        model_id = model_mapping[model]

        try:
            # ComfyUI images are in BCHW format with float values [0,1]
            # Extract the first image if we have a batch
            if len(image.shape) == 4:
                image_tensor = image[0]  # Get the first image from the batch
            else:
                image_tensor = image

            # Convert from BCHW/CHW to HWC format for PIL
            if image_tensor.shape[0] in [1, 3, 4]:  # If first dimension is channels
                image_tensor = image_tensor.permute(1, 2, 0)

            # Convert to numpy and scale to 0-255 range
            np_image = image_tensor.cpu().numpy()
            if np_image.max() <= 1.0:
                np_image = (np_image * 255).astype(np.uint8)

            # Handle different channel configurations
            if np_image.shape[2] == 1:  # Grayscale
                np_image = np.repeat(np_image, 3, axis=2)
            elif np_image.shape[2] == 4:  # RGBA
                np_image = np_image[:, :, :3]  # Remove alpha channel

            # Create PIL image from numpy array
            pil_image = Image.fromarray(np_image)

            # Encode the PIL image to base64
            image_bytes = io.BytesIO()
            pil_image.save(image_bytes, format='PNG')
            image_bytes.seek(0)
            base64_image = base64.b64encode(image_bytes.getvalue()).decode('utf-8')

            # Create OpenAI client
            client = OpenAI(api_key=api_key)

            # Create completion with the Vision API
            response = client.responses.create(
                model=model_id,
                input=[
                    {
                        "role": "user",
                        "content": [
                            {
                                "type": "input_text",
                                "text": prompt,
                            },
                            {
                                "type": "input_image",
                                "image_url": f"data:image/png;base64,{base64_image}",
                            },
                        ],
                    }
                ]
            )

            analysis = response.output_text.strip()
            return (analysis,)

        except Exception as e:
            import traceback
            error_details = traceback.format_exc()
            return (f"Error processing image: {str(e)}\n\nDetails: {error_details}",)