feat: add TencentARC PhotoMaker support (#179)

* first efforts at implementing photomaker; lots more to do

* added PhotoMakerIDEncoder model in SD

* fixed soem bugs; now photomaker model weights can be loaded into their tensor buffers

* added input id image loading

* added preprocessing inpit id images

* finished get_num_tensors

* fixed a bug in remove_duplicates

* add a get_learned_condition_with_trigger function to do photomaker stuff

* add a convert_token_to_id function for photomaker to extract trigger word's token id

* making progress; need to implement tokenizer decoder

* making more progress; finishing vision model forward

* debugging vision_model outputs

* corrected clip vision model output

* continue making progress in id fusion process

* finished stacked id embedding; to be tested

* remove garbage file

* debuging graph compute

* more progress; now alloc buffer failed

* fixed wtype issue; input images can only be 1 because issue with transformer when batch size > 1 (to be investigated)

* added delayed subject conditioning; now photomaker runs and generates images

* fixed stat_merge_step

* added photomaker lora model (to be tested)

* reworked pmid lora

* finished applying pmid lora; to be tested

* finalized pmid lora

* add a few print tensor; tweak in sample again

* small tweak; still not getting ID faces

* fixed a bug in FuseBlock forward; also remove diag_mask op in for vision transformer; getting better results

* disable pmid lora apply for now; 1 input image seems working; > 1 not working

* turn pmid lora apply back on

* fixed a decode bug

* fixed a bug in ggml's conv_2d, and now > 1 input images working

* add style_ratio as a cli param; reworked encode with trigger for attention weights

* merge commit fixing lora free param buffer error

* change default style ratio to 10%

* added an option to offload vae decoder to CPU for mem-limited gpus

* removing image normalization step seems making ID fidelity much higher

* revert default style ratio back ro 20%

* added an option for normalizing input ID images; cleaned up debugging code

* more clean up

* fixed bugs; now failed with cuda error; likely out-of-mem on GPU

* free pmid model params when required

* photomaker working properly now after merging and adapting to GGMLBlock API

* remove tensor renaming;  fixing names in the photomaker model file

* updated README.md to include instructions and notes for running PhotoMaker

* a bit clean up

* remove -DGGML_CUDA_FORCE_MMQ; more clean up and README update

* add input image requirement in README

* bring back freeing pmid lora params buffer; simply pooled output of CLIPvision

* remove MultiheadAttention2; customized MultiheadAttention

* added a WIN32 get_files_from_dir; turn off Photomakder if receiving no input images

* update docs

* fix ci error

* make stable-diffusion.h a pure c header file

This reverts commit 27887b630db6a92f269f0aef8de9bc9832ab50a9.

* fix ci error

* format code

* reuse get_learned_condition

* reuse pad_tokens

* reuse CLIPVisionModel

* reuse LoraModel

* add --clip-on-cpu

* fix lora name conversion for SDXL

---------

Co-authored-by: bssrdf <bssrdf@gmail.com>
Co-authored-by: leejet <leejet714@gmail.com>
This commit is contained in:
bssrdf
2024-03-12 11:15:17 -04:00
committed by GitHub
parent 61980171a1
commit a469688e30
28 changed files with 3935 additions and 186 deletions

View File

@@ -10,6 +10,7 @@
#include "stable-diffusion.h"
#define STB_IMAGE_IMPLEMENTATION
#define STB_IMAGE_STATIC
#include "stb_image.h"
#define STB_IMAGE_WRITE_IMPLEMENTATION
@@ -65,6 +66,8 @@ struct SDParams {
std::string esrgan_path;
std::string controlnet_path;
std::string embeddings_path;
std::string stacked_id_embeddings_path;
std::string input_id_images_path;
sd_type_t wtype = SD_TYPE_COUNT;
std::string lora_model_dir;
std::string output_path = "output.png";
@@ -73,12 +76,13 @@ struct SDParams {
std::string prompt;
std::string negative_prompt;
float min_cfg = 1.0f;
float cfg_scale = 7.0f;
int clip_skip = -1; // <= 0 represents unspecified
int width = 512;
int height = 512;
int batch_count = 1;
float min_cfg = 1.0f;
float cfg_scale = 7.0f;
float style_ratio = 20.f;
int clip_skip = -1; // <= 0 represents unspecified
int width = 512;
int height = 512;
int batch_count = 1;
int video_frames = 6;
int motion_bucket_id = 127;
@@ -95,6 +99,9 @@ struct SDParams {
bool verbose = false;
bool vae_tiling = false;
bool control_net_cpu = false;
bool normalize_input = false;
bool clip_on_cpu = false;
bool vae_on_cpu = false;
bool canny_preprocess = false;
int upscale_repeats = 1;
};
@@ -110,10 +117,16 @@ void print_params(SDParams params) {
printf(" esrgan_path: %s\n", params.esrgan_path.c_str());
printf(" controlnet_path: %s\n", params.controlnet_path.c_str());
printf(" embeddings_path: %s\n", params.embeddings_path.c_str());
printf(" stacked_id_embeddings_path: %s\n", params.stacked_id_embeddings_path.c_str());
printf(" input_id_images_path: %s\n", params.input_id_images_path.c_str());
printf(" style ratio: %.2f\n", params.style_ratio);
printf(" normzalize input image : %s\n", params.normalize_input ? "true" : "false");
printf(" output_path: %s\n", params.output_path.c_str());
printf(" init_img: %s\n", params.input_path.c_str());
printf(" control_image: %s\n", params.control_image_path.c_str());
printf(" clip on cpu: %s\n", params.clip_on_cpu ? "true" : "false");
printf(" controlnet cpu: %s\n", params.control_net_cpu ? "true" : "false");
printf(" vae decoder on cpu:%s\n", params.vae_on_cpu ? "true" : "false");
printf(" strength(control): %.2f\n", params.control_strength);
printf(" prompt: %s\n", params.prompt.c_str());
printf(" negative_prompt: %s\n", params.negative_prompt.c_str());
@@ -146,6 +159,9 @@ void print_usage(int argc, const char* argv[]) {
printf(" --taesd [TAESD_PATH] path to taesd. Using Tiny AutoEncoder for fast decoding (low quality)\n");
printf(" --control-net [CONTROL_PATH] path to control net model\n");
printf(" --embd-dir [EMBEDDING_PATH] path to embeddings.\n");
printf(" --stacked-id-embd-dir [DIR] path to PHOTOMAKER stacked id embeddings.\n");
printf(" --input-id-images-dir [DIR] path to PHOTOMAKER input id images dir.\n");
printf(" --normalize-input normalize PHOTOMAKER input id images\n");
printf(" --upscale-model [ESRGAN_PATH] path to esrgan model. Upscale images after generate, just RealESRGAN_x4plus_anime_6B supported by now.\n");
printf(" --upscale-repeats Run the ESRGAN upscaler this many times (default 1)\n");
printf(" --type [TYPE] weight type (f32, f16, q4_0, q4_1, q5_0, q5_1, q8_0)\n");
@@ -158,6 +174,7 @@ void print_usage(int argc, const char* argv[]) {
printf(" -n, --negative-prompt PROMPT the negative prompt (default: \"\")\n");
printf(" --cfg-scale SCALE unconditional guidance scale: (default: 7.0)\n");
printf(" --strength STRENGTH strength for noising/unnoising (default: 0.75)\n");
printf(" --style-ratio STYLE-RATIO strength for keeping input identity (default: 20%%)\n");
printf(" --control-strength STRENGTH strength to apply Control Net (default: 0.9)\n");
printf(" 1.0 corresponds to full destruction of information in init image\n");
printf(" -H, --height H image height, in pixel space (default: 512)\n");
@@ -244,6 +261,18 @@ void parse_args(int argc, const char** argv, SDParams& params) {
break;
}
params.embeddings_path = argv[i];
} else if (arg == "--stacked-id-embd-dir") {
if (++i >= argc) {
invalid_arg = true;
break;
}
params.stacked_id_embeddings_path = argv[i];
} else if (arg == "--input-id-images-dir") {
if (++i >= argc) {
invalid_arg = true;
break;
}
params.input_id_images_path = argv[i];
} else if (arg == "--type") {
if (++i >= argc) {
invalid_arg = true;
@@ -327,6 +356,12 @@ void parse_args(int argc, const char** argv, SDParams& params) {
break;
}
params.strength = std::stof(argv[i]);
} else if (arg == "--style-ratio") {
if (++i >= argc) {
invalid_arg = true;
break;
}
params.style_ratio = std::stof(argv[i]);
} else if (arg == "--control-strength") {
if (++i >= argc) {
invalid_arg = true;
@@ -361,6 +396,12 @@ void parse_args(int argc, const char** argv, SDParams& params) {
params.vae_tiling = true;
} else if (arg == "--control-net-cpu") {
params.control_net_cpu = true;
} else if (arg == "--normalize-input") {
params.normalize_input = true;
} else if (arg == "--clip-on-cpu") {
params.clip_on_cpu = true; // will slow down get_learned_condiotion but necessary for low MEM GPUs
} else if (arg == "--vae-on-cpu") {
params.vae_on_cpu = true; // will slow down latent decoding but necessary for low MEM GPUs
} else if (arg == "--canny") {
params.canny_preprocess = true;
} else if (arg == "-b" || arg == "--batch-count") {
@@ -613,6 +654,7 @@ int main(int argc, const char* argv[]) {
params.controlnet_path.c_str(),
params.lora_model_dir.c_str(),
params.embeddings_path.c_str(),
params.stacked_id_embeddings_path.c_str(),
vae_decode_only,
params.vae_tiling,
true,
@@ -620,7 +662,9 @@ int main(int argc, const char* argv[]) {
params.wtype,
params.rng_type,
params.schedule,
params.control_net_cpu);
params.clip_on_cpu,
params.control_net_cpu,
params.vae_on_cpu);
if (sd_ctx == NULL) {
printf("new_sd_ctx_t failed\n");
@@ -664,7 +708,10 @@ int main(int argc, const char* argv[]) {
params.seed,
params.batch_count,
control_image,
params.control_strength);
params.control_strength,
params.style_ratio,
params.normalize_input,
params.input_id_images_path.c_str());
} else {
sd_image_t input_image = {(uint32_t)params.width,
(uint32_t)params.height,