feat: add TencentARC PhotoMaker support (#179)

* first efforts at implementing photomaker; lots more to do * added PhotoMakerIDEncoder model in SD * fixed some bugs; now photomaker model weights can be loaded into their tensor buffers * added input id image loading * added preprocessing input id images * finished get_num_tensors * fixed a bug in remove_duplicates * add a get_learned_condition_with_trigger function to do photomaker stuff * add a convert_token_to_id function for photomaker to extract trigger word's token id * making progress; need to implement tokenizer decoder * making more progress; finishing vision model forward * debugging vision_model outputs * corrected clip vision model output * continue making progress in id fusion process * finished stacked id embedding; to be tested * remove garbage file * debugging graph compute * more progress; now alloc buffer failed * fixed wtype issue; input images can only be 1 because issue with transformer when batch size > 1 (to be investigated) * added delayed subject conditioning; now photomaker runs and generates images * fixed stat_merge_step * added photomaker lora model (to be tested) * reworked pmid lora * finished applying pmid lora; to be tested * finalized pmid lora * add a few print tensor; tweak in sample again * small tweak; still not getting ID faces * fixed a bug in FuseBlock forward; also remove diag_mask op for vision transformer; getting better results * disable pmid lora apply for now; 1 input image seems working; > 1 not working * turn pmid lora apply back on * fixed a decode bug * fixed a bug in ggml's conv_2d, and now > 1 input images working * add style_ratio as a cli param; reworked encode with trigger for attention weights * merge commit fixing lora free param buffer error * change default style ratio to 10% * added an option to offload vae decoder to CPU for mem-limited gpus * removing image normalization step seems to make ID fidelity much higher * revert default style ratio back to 20% * added an option for normalizing input ID 
images; cleaned up debugging code * more clean up * fixed bugs; now failed with cuda error; likely out-of-mem on GPU * free pmid model params when required * photomaker working properly now after merging and adapting to GGMLBlock API * remove tensor renaming; fixing names in the photomaker model file * updated README.md to include instructions and notes for running PhotoMaker * a bit of clean up * remove -DGGML_CUDA_FORCE_MMQ; more clean up and README update * add input image requirement in README * bring back freeing pmid lora params buffer; simply pooled output of CLIPvision * remove MultiheadAttention2; customized MultiheadAttention * added a WIN32 get_files_from_dir; turn off PhotoMaker if receiving no input images * update docs * fix ci error * make stable-diffusion.h a pure c header file This reverts commit 27887b630db6a92f269f0aef8de9bc9832ab50a9. * fix ci error * format code * reuse get_learned_condition * reuse pad_tokens * reuse CLIPVisionModel * reuse LoraModel * add --clip-on-cpu * fix lora name conversion for SDXL --------- Co-authored-by: bssrdf <bssrdf@gmail.com> Co-authored-by: leejet <leejet714@gmail.com>
2024-03-12 11:15:17 -04:00
parent 61980171a1
commit a469688e30
28 changed files with 3935 additions and 186 deletions
--- a/examples/cli/main.cpp
+++ b/examples/cli/main.cpp
@@ -10,6 +10,7 @@
 #include "stable-diffusion.h"

 #define STB_IMAGE_IMPLEMENTATION
+#define STB_IMAGE_STATIC
 #include "stb_image.h"

 #define STB_IMAGE_WRITE_IMPLEMENTATION
@@ -65,6 +66,8 @@ struct SDParams {
    std::string esrgan_path;
    std::string controlnet_path;
    std::string embeddings_path;
+    std::string stacked_id_embeddings_path;
+    std::string input_id_images_path;
    sd_type_t wtype = SD_TYPE_COUNT;
    std::string lora_model_dir;
    std::string output_path = "output.png";
@@ -73,12 +76,13 @@ struct SDParams {

    std::string prompt;
    std::string negative_prompt;
-    float min_cfg   = 1.0f;
-    float cfg_scale = 7.0f;
-    int clip_skip   = -1;  // <= 0 represents unspecified
-    int width       = 512;
-    int height      = 512;
-    int batch_count = 1;
+    float min_cfg     = 1.0f;
+    float cfg_scale   = 7.0f;
+    float style_ratio = 20.f;
+    int clip_skip     = -1;  // <= 0 represents unspecified
+    int width         = 512;
+    int height        = 512;
+    int batch_count   = 1;

    int video_frames         = 6;
    int motion_bucket_id     = 127;
@@ -95,6 +99,9 @@ struct SDParams {
    bool verbose                  = false;
    bool vae_tiling               = false;
    bool control_net_cpu          = false;
+    bool normalize_input          = false;
+    bool clip_on_cpu              = false;
+    bool vae_on_cpu               = false;
    bool canny_preprocess         = false;
    int upscale_repeats           = 1;
 };
@@ -110,10 +117,16 @@ void print_params(SDParams params) {
    printf("    esrgan_path:       %s\n", params.esrgan_path.c_str());
    printf("    controlnet_path:   %s\n", params.controlnet_path.c_str());
    printf("    embeddings_path:   %s\n", params.embeddings_path.c_str());
+    printf("    stacked_id_embeddings_path:   %s\n", params.stacked_id_embeddings_path.c_str());
+    printf("    input_id_images_path:   %s\n", params.input_id_images_path.c_str());
+    printf("    style ratio:       %.2f\n", params.style_ratio);
+    printf("    normzalize input image :  %s\n", params.normalize_input ? "true" : "false");
    printf("    output_path:       %s\n", params.output_path.c_str());
    printf("    init_img:          %s\n", params.input_path.c_str());
    printf("    control_image:     %s\n", params.control_image_path.c_str());
+    printf("    clip on cpu:       %s\n", params.clip_on_cpu ? "true" : "false");
    printf("    controlnet cpu:    %s\n", params.control_net_cpu ? "true" : "false");
+    printf("    vae decoder on cpu:%s\n", params.vae_on_cpu ? "true" : "false");
    printf("    strength(control): %.2f\n", params.control_strength);
    printf("    prompt:            %s\n", params.prompt.c_str());
    printf("    negative_prompt:   %s\n", params.negative_prompt.c_str());
@@ -146,6 +159,9 @@ void print_usage(int argc, const char* argv[]) {
    printf("  --taesd [TAESD_PATH]               path to taesd. Using Tiny AutoEncoder for fast decoding (low quality)\n");
    printf("  --control-net [CONTROL_PATH]       path to control net model\n");
    printf("  --embd-dir [EMBEDDING_PATH]        path to embeddings.\n");
+    printf("  --stacked-id-embd-dir [DIR]        path to PHOTOMAKER stacked id embeddings.\n");
+    printf("  --input-id-images-dir [DIR]        path to PHOTOMAKER input id images dir.\n");
+    printf("  --normalize-input                  normalize PHOTOMAKER input id images\n");
    printf("  --upscale-model [ESRGAN_PATH]      path to esrgan model. Upscale images after generate, just RealESRGAN_x4plus_anime_6B supported by now.\n");
    printf("  --upscale-repeats                  Run the ESRGAN upscaler this many times (default 1)\n");
    printf("  --type [TYPE]                      weight type (f32, f16, q4_0, q4_1, q5_0, q5_1, q8_0)\n");
@@ -158,6 +174,7 @@ void print_usage(int argc, const char* argv[]) {
    printf("  -n, --negative-prompt PROMPT       the negative prompt (default: \"\")\n");
    printf("  --cfg-scale SCALE                  unconditional guidance scale: (default: 7.0)\n");
    printf("  --strength STRENGTH                strength for noising/unnoising (default: 0.75)\n");
+    printf("  --style-ratio STYLE-RATIO          strength for keeping input identity (default: 20%%)\n");
    printf("  --control-strength STRENGTH        strength to apply Control Net (default: 0.9)\n");
    printf("                                     1.0 corresponds to full destruction of information in init image\n");
    printf("  -H, --height H                     image height, in pixel space (default: 512)\n");
@@ -244,6 +261,18 @@ void parse_args(int argc, const char** argv, SDParams& params) {
                break;
            }
            params.embeddings_path = argv[i];
+        } else if (arg == "--stacked-id-embd-dir") {
+            if (++i >= argc) {
+                invalid_arg = true;
+                break;
+            }
+            params.stacked_id_embeddings_path = argv[i];
+        } else if (arg == "--input-id-images-dir") {
+            if (++i >= argc) {
+                invalid_arg = true;
+                break;
+            }
+            params.input_id_images_path = argv[i];
        } else if (arg == "--type") {
            if (++i >= argc) {
                invalid_arg = true;
@@ -327,6 +356,12 @@ void parse_args(int argc, const char** argv, SDParams& params) {
                break;
            }
            params.strength = std::stof(argv[i]);
+        } else if (arg == "--style-ratio") {
+            if (++i >= argc) {
+                invalid_arg = true;
+                break;
+            }
+            params.style_ratio = std::stof(argv[i]);
        } else if (arg == "--control-strength") {
            if (++i >= argc) {
                invalid_arg = true;
@@ -361,6 +396,12 @@ void parse_args(int argc, const char** argv, SDParams& params) {
            params.vae_tiling = true;
        } else if (arg == "--control-net-cpu") {
            params.control_net_cpu = true;
+        } else if (arg == "--normalize-input") {
+            params.normalize_input = true;
+        } else if (arg == "--clip-on-cpu") {
+            params.clip_on_cpu = true;  // will slow down get_learned_condiotion but necessary for low MEM GPUs
+        } else if (arg == "--vae-on-cpu") {
+            params.vae_on_cpu = true;  // will slow down latent decoding but necessary for low MEM GPUs
        } else if (arg == "--canny") {
            params.canny_preprocess = true;
        } else if (arg == "-b" || arg == "--batch-count") {
@@ -613,6 +654,7 @@ int main(int argc, const char* argv[]) {
                                  params.controlnet_path.c_str(),
                                  params.lora_model_dir.c_str(),
                                  params.embeddings_path.c_str(),
+                                  params.stacked_id_embeddings_path.c_str(),
                                  vae_decode_only,
                                  params.vae_tiling,
                                  true,
@@ -620,7 +662,9 @@ int main(int argc, const char* argv[]) {
                                  params.wtype,
                                  params.rng_type,
                                  params.schedule,
-                                  params.control_net_cpu);
+                                  params.clip_on_cpu,
+                                  params.control_net_cpu,
+                                  params.vae_on_cpu);

    if (sd_ctx == NULL) {
        printf("new_sd_ctx_t failed\n");
@@ -664,7 +708,10 @@ int main(int argc, const char* argv[]) {
                          params.seed,
                          params.batch_count,
                          control_image,
-                          params.control_strength);
+                          params.control_strength,
+                          params.style_ratio,
+                          params.normalize_input,
+                          params.input_id_images_path.c_str());
    } else {
        sd_image_t input_image = {(uint32_t)params.width,
                                  (uint32_t)params.height,