From b6899e8fc200ff9e2a122751335d90836821e5fe Mon Sep 17 00:00:00 2001 From: Urs Ganse Date: Fri, 8 Sep 2023 18:47:28 +0300 Subject: [PATCH] feat: add Euler, Heun and DPM++ (2M) samplers (#50) * Add Euler sampler * Add Heun sampler * Add DPM++ (2M) sampler * Add modified DPM++ (2M) "v2" sampler. This was proposed in a issue discussion of the stable diffusion webui, at https://github.com/AUTOMATIC1111/stable-diffusion-webui/discussions/8457 and apparently works around overstepping of the DPM++ (2M) method with small step counts. The parameter is called dpmpp2mv2 here. * match code style --------- Co-authored-by: Urs Ganse Co-authored-by: leejet --- README.md | 8 +- examples/main.cpp | 32 ++++- stable-diffusion.cpp | 290 +++++++++++++++++++++++++++++++++++-------- stable-diffusion.h | 7 +- 4 files changed, 280 insertions(+), 57 deletions(-) diff --git a/README.md b/README.md index b61cb5f..170405a 100644 --- a/README.md +++ b/README.md @@ -20,6 +20,10 @@ Inference of [Stable Diffusion](https://github.com/CompVis/stable-diffusion) in - [stable-diffusion-webui](https://github.com/AUTOMATIC1111/stable-diffusion-webui) style tokenizer (not all the features, only token weighting for now) - Sampling method - `Euler A` + - `Euler` + - `Heun` + - `DPM++ 2M` + - [`DPM++ 2M v2`](https://github.com/AUTOMATIC1111/stable-diffusion-webui/discussions/8457) - Cross-platform reproducibility (`--rng cuda`, consistent with the `stable-diffusion-webui GPU RNG`) - Supported platforms - Linux @@ -125,8 +129,10 @@ arguments: 1.0 corresponds to full destruction of information in init image -H, --height H image height, in pixel space (default: 512) -W, --width W image width, in pixel space (default: 512) - --sample-method SAMPLE_METHOD sample method (default: "eular a") + --sampling-method {euler, euler_a, heun, dpm++2m, dpm++2mv2} + sampling method (default: "euler_a") --steps STEPS number of sample steps (default: 20) + --rng {std_default, cuda} RNG (default: cuda) -s SEED, --seed SEED RNG seed (default: 42, use random seed for < 0) -v, --verbose print extra info ``` diff --git a/examples/main.cpp b/examples/main.cpp index 6cc4e68..b2a1ddf 100644 --- a/examples/main.cpp +++ b/examples/main.cpp @@ -72,6 +72,14 @@ const char* rng_type_to_str[] = { "cuda", }; +// Names of the sampler method, same order as enum SampleMethod in stable-diffusion.h +const char* sample_method_str[] = { + "euler_a", + "euler", + "heun", + "dpm++2m", + "dpm++2mv2"}; + struct Option { int n_threads = -1; std::string mode = TXT2IMG; @@ -83,7 +91,7 @@ struct Option { float cfg_scale = 7.0f; int w = 512; int h = 512; - SampleMethod sample_method = EULAR_A; + SampleMethod sample_method = EULER_A; int sample_steps = 20; float strength = 0.75f; RNGType rng_type = CUDA_RNG; @@ -102,7 +110,7 @@ struct Option { printf(" cfg_scale: %.2f\n", cfg_scale); printf(" width: %d\n", w); printf(" height: %d\n", h); - printf(" sample_method: %s\n", "eular a"); + printf(" sample_method: %s\n", sample_method_str[sample_method]); printf(" sample_steps: %d\n", sample_steps); printf(" strength: %.2f\n", strength); printf(" rng: %s\n", rng_type_to_str[rng_type]); @@ -128,7 +136,8 @@ void print_usage(int argc, const char* argv[]) { printf(" 1.0 corresponds to full destruction of information in init image\n"); printf(" -H, --height H image height, in pixel space (default: 512)\n"); printf(" -W, --width W image width, in pixel space (default: 512)\n"); - printf(" --sample-method SAMPLE_METHOD sample method (default: \"eular a\")\n"); + printf(" --sampling-method {euler, 
euler_a, heun, dpm++2m, dpm++2mv2}\n"); + printf(" sampling method (default: \"euler_a\")\n"); printf(" --steps STEPS number of sample steps (default: 20)\n"); printf(" --rng {std_default, cuda} RNG (default: cuda)\n"); printf(" -s SEED, --seed SEED RNG seed (default: 42, use random seed for < 0)\n"); @@ -234,6 +243,23 @@ void parse_args(int argc, const char* argv[], Option* opt) { break; } opt->seed = std::stoll(argv[i]); + } else if (arg == "--sampling-method") { + if (++i >= argc) { + invalid_arg = true; + break; + } + const char* sample_method_selected = argv[i]; + int sample_method_found = -1; + for (int m = 0; m < N_SAMPLE_METHODS; m++) { + if (!strcmp(sample_method_selected, sample_method_str[m])) { + sample_method_found = m; + } + } + if (sample_method_found == -1) { + invalid_arg = true; + break; + } + opt->sample_method = (SampleMethod)sample_method_found; } else if (arg == "-h" || arg == "--help") { print_usage(argc, argv); exit(0); diff --git a/stable-diffusion.cpp b/stable-diffusion.cpp index 954cef7..9263ce7 100644 --- a/stable-diffusion.cpp +++ b/stable-diffusion.cpp @@ -3502,69 +3502,255 @@ class StableDiffusionGGML { ggml_graph_print(&diffusion_graph); #endif int64_t t1 = ggml_time_ms(); - LOG_INFO("step %d sampling completed, taking %.2fs", step, (t1 - t0) * 1.0f / 1000); - LOG_DEBUG("diffusion graph use %.2fMB runtime memory: static %.2fMB, dynamic %.2fMB", - (ctx_size + ggml_curr_max_dynamic_size()) * 1.0f / 1024 / 1024, - ctx_size * 1.0f / 1024 / 1024, - ggml_curr_max_dynamic_size() * 1.0f / 1024 / 1024); - LOG_DEBUG("%zu bytes of dynamic memory has not been released yet", ggml_dynamic_size()); + if (step > 0) { + LOG_INFO("step %d sampling completed, taking %.2fs", step, (t1 - t0) * 1.0f / 1000); + LOG_DEBUG("diffusion graph use %.2fMB runtime memory: static %.2fMB, dynamic %.2fMB", + (ctx_size + ggml_curr_max_dynamic_size()) * 1.0f / 1024 / 1024, + ctx_size * 1.0f / 1024 / 1024, + ggml_curr_max_dynamic_size() * 1.0f / 1024 / 1024); + LOG_DEBUG("%zu bytes of dynamic memory has not been released yet", ggml_dynamic_size()); + } }; // sample_euler_ancestral - { - ggml_set_dynamic(ctx, false); - struct ggml_tensor* noise = ggml_dup_tensor(ctx, x); - struct ggml_tensor* d = ggml_dup_tensor(ctx, x); - ggml_set_dynamic(ctx, params.dynamic); + switch (method) { + case EULER_A: { + LOG_INFO("sampling using Euler A method"); + ggml_set_dynamic(ctx, false); + struct ggml_tensor* noise = ggml_dup_tensor(ctx, x); + struct ggml_tensor* d = ggml_dup_tensor(ctx, x); + ggml_set_dynamic(ctx, params.dynamic); - for (int i = 0; i < steps; i++) { - float sigma = sigmas[i]; + for (int i = 0; i < steps; i++) { + float sigma = sigmas[i]; - // denoise - denoise(x, sigma, i + 1); + // denoise + denoise(x, sigma, i + 1); - // d = (x - denoised) / sigma - { - float* vec_d = (float*)d->data; - float* vec_x = (float*)x->data; - float* vec_denoised = (float*)denoised->data; - - for (int i = 0; i < ggml_nelements(d); i++) { - vec_d[i] = (vec_x[i] - vec_denoised[i]) / sigma; - } - } - - // get_ancestral_step - float sigma_up = std::min(sigmas[i + 1], - std::sqrt(sigmas[i + 1] * sigmas[i + 1] * (sigmas[i] * sigmas[i] - sigmas[i + 1] * sigmas[i + 1]) / (sigmas[i] * sigmas[i]))); - float sigma_down = std::sqrt(sigmas[i + 1] * sigmas[i + 1] - sigma_up * sigma_up); - - // Euler method - float dt = sigma_down - sigmas[i]; - // x = x + d * dt - { - float* vec_d = (float*)d->data; - float* vec_x = (float*)x->data; - - for (int i = 0; i < ggml_nelements(x); i++) { - vec_x[i] = vec_x[i] + vec_d[i] * dt; - } - 
} - - if (sigmas[i + 1] > 0) { - // x = x + noise_sampler(sigmas[i], sigmas[i + 1]) * s_noise * sigma_up - ggml_tensor_set_f32_randn(noise, rng); - // noise = load_tensor_from_file(res_ctx, "./rand" + std::to_string(i+1) + ".bin"); + // d = (x - denoised) / sigma { + float* vec_d = (float*)d->data; + float* vec_x = (float*)x->data; + float* vec_denoised = (float*)denoised->data; + + for (int i = 0; i < ggml_nelements(d); i++) { + vec_d[i] = (vec_x[i] - vec_denoised[i]) / sigma; + } + } + + // get_ancestral_step + float sigma_up = std::min(sigmas[i + 1], + std::sqrt(sigmas[i + 1] * sigmas[i + 1] * (sigmas[i] * sigmas[i] - sigmas[i + 1] * sigmas[i + 1]) / (sigmas[i] * sigmas[i]))); + float sigma_down = std::sqrt(sigmas[i + 1] * sigmas[i + 1] - sigma_up * sigma_up); + + // Euler method + float dt = sigma_down - sigmas[i]; + // x = x + d * dt + { + float* vec_d = (float*)d->data; float* vec_x = (float*)x->data; - float* vec_noise = (float*)noise->data; for (int i = 0; i < ggml_nelements(x); i++) { - vec_x[i] = vec_x[i] + vec_noise[i] * sigma_up; + vec_x[i] = vec_x[i] + vec_d[i] * dt; + } + } + + if (sigmas[i + 1] > 0) { + // x = x + noise_sampler(sigmas[i], sigmas[i + 1]) * s_noise * sigma_up + ggml_tensor_set_f32_randn(noise, rng); + // noise = load_tensor_from_file(res_ctx, "./rand" + std::to_string(i+1) + ".bin"); + { + float* vec_x = (float*)x->data; + float* vec_noise = (float*)noise->data; + + for (int i = 0; i < ggml_nelements(x); i++) { + vec_x[i] = vec_x[i] + vec_noise[i] * sigma_up; + } } } } - } + } break; + case EULER: // Implemented without any sigma churn + { + LOG_INFO("sampling using Euler method"); + ggml_set_dynamic(ctx, false); + struct ggml_tensor* d = ggml_dup_tensor(ctx, x); + ggml_set_dynamic(ctx, params.dynamic); + + for (int i = 0; i < steps; i++) { + float sigma = sigmas[i]; + + // denoise + denoise(x, sigma, i + 1); + + // d = (x - denoised) / sigma + { + float* vec_d = (float*)d->data; + float* vec_x = (float*)x->data; + float* vec_denoised = (float*)denoised->data; + + for (int j = 0; j < ggml_nelements(d); j++) { + vec_d[j] = (vec_x[j] - vec_denoised[j]) / sigma; + } + } + + float dt = sigmas[i + 1] - sigma; + // x = x + d * dt + { + float* vec_d = (float*)d->data; + float* vec_x = (float*)x->data; + + for (int j = 0; j < ggml_nelements(x); j++) { + vec_x[j] = vec_x[j] + vec_d[j] * dt; + } + } + } + } break; + case HEUN: { + LOG_INFO("sampling using Heun method"); + ggml_set_dynamic(ctx, false); + struct ggml_tensor* d = ggml_dup_tensor(ctx, x); + struct ggml_tensor* x2 = ggml_dup_tensor(ctx, x); + ggml_set_dynamic(ctx, params.dynamic); + + for (int i = 0; i < steps; i++) { + // denoise + denoise(x, sigmas[i], -(i + 1)); + + // d = (x - denoised) / sigma + { + float* vec_d = (float*)d->data; + float* vec_x = (float*)x->data; + float* vec_denoised = (float*)denoised->data; + + for (int j = 0; j < ggml_nelements(x); j++) { + vec_d[j] = (vec_x[j] - vec_denoised[j]) / sigmas[i]; + } + } + + float dt = sigmas[i + 1] - sigmas[i]; + if (sigmas[i + 1] == 0) { + // Euler step + // x = x + d * dt + float* vec_d = (float*)d->data; + float* vec_x = (float*)x->data; + + for (int j = 0; j < ggml_nelements(x); j++) { + vec_x[j] = vec_x[j] + vec_d[j] * dt; + } + } else { + // Heun step + float* vec_d = (float*)d->data; + float* vec_d2 = (float*)d->data; + float* vec_x = (float*)x->data; + float* vec_x2 = (float*)x2->data; + + for (int j = 0; j < ggml_nelements(x); j++) { + vec_x2[j] = vec_x[j] + vec_d[j] * dt; + } + + denoise(x2, sigmas[i + 1], i + 1); + float* vec_denoised = 
(float*)denoised->data; + for (int j = 0; j < ggml_nelements(x); j++) { + float d2 = (vec_x2[j] - vec_denoised[j]) / sigmas[i + 1]; + vec_d[j] = (vec_d[j] + d2) / 2; + vec_x[j] = vec_x[j] + vec_d[j] * dt; + } + } + } + } break; + case DPMPP2M: // DPM++ (2M) from Karras et al (2022) + { + LOG_INFO("sampling using DPM++ (2M) method"); + ggml_set_dynamic(ctx, false); + struct ggml_tensor* old_denoised = ggml_dup_tensor(ctx, x); + ggml_set_dynamic(ctx, params.dynamic); + + auto t_fn = [](float sigma) -> float { return -log(sigma); }; + + for (int i = 0; i < steps; i++) { + // denoise + denoise(x, sigmas[i], i + 1); + + float t = t_fn(sigmas[i]); + float t_next = t_fn(sigmas[i + 1]); + float h = t_next - t; + float a = sigmas[i + 1] / sigmas[i]; + float b = exp(-h) - 1.; + float* vec_x = (float*)x->data; + float* vec_denoised = (float*)denoised->data; + float* vec_old_denoised = (float*)old_denoised->data; + + if (i == 0 || sigmas[i + 1] == 0) { + // Simpler step for the edge cases + for (int j = 0; j < ggml_nelements(x); j++) { + vec_x[j] = a * vec_x[j] - b * vec_denoised[j]; + } + } else { + float h_last = t - t_fn(sigmas[i - 1]); + float r = h_last / h; + for (int j = 0; j < ggml_nelements(x); j++) { + float denoised_d = (1. + 1. / (2. * r)) * vec_denoised[j] - (1. / (2. * r)) * vec_old_denoised[j]; + vec_x[j] = a * vec_x[j] - b * denoised_d; + } + } + + // old_denoised = denoised + for (int j = 0; j < ggml_nelements(x); j++) { + vec_old_denoised[j] = vec_denoised[j]; + } + } + } break; + case DPMPP2Mv2: // Modified DPM++ (2M) from https://github.com/AUTOMATIC1111/stable-diffusion-webui/discussions/8457 + { + LOG_INFO("sampling using modified DPM++ (2M) method"); + ggml_set_dynamic(ctx, false); + struct ggml_tensor* old_denoised = ggml_dup_tensor(ctx, x); + ggml_set_dynamic(ctx, params.dynamic); + + auto t_fn = [](float sigma) -> float { return -log(sigma); }; + + for (int i = 0; i < steps; i++) { + // denoise + denoise(x, sigmas[i], i + 1); + + float t = t_fn(sigmas[i]); + float t_next = t_fn(sigmas[i + 1]); + float h = t_next - t; + float a = sigmas[i + 1] / sigmas[i]; + float* vec_x = (float*)x->data; + float* vec_denoised = (float*)denoised->data; + float* vec_old_denoised = (float*)old_denoised->data; + + if (i == 0 || sigmas[i + 1] == 0) { + // Simpler step for the edge cases + float b = exp(-h) - 1.; + for (int j = 0; j < ggml_nelements(x); j++) { + vec_x[j] = a * vec_x[j] - b * vec_denoised[j]; + } + } else { + float h_last = t - t_fn(sigmas[i - 1]); + float h_min = std::min(h_last, h); + float h_max = std::max(h_last, h); + float r = h_max / h_min; + float h_d = (h_max + h_min) / 2.; + float b = exp(-h_d) - 1.; + for (int j = 0; j < ggml_nelements(x); j++) { + float denoised_d = (1. + 1. / (2. * r)) * vec_denoised[j] - (1. / (2. 
* r)) * vec_old_denoised[j]; + vec_x[j] = a * vec_x[j] - b * denoised_d; + } + } + + // old_denoised = denoised + for (int j = 0; j < ggml_nelements(x); j++) { + vec_old_denoised[j] = vec_denoised[j]; + } + } + } break; + + default: + LOG_ERROR("Attempting to sample with nonexisting sample method %i", method); + abort(); } size_t rt_mem_size = ctx_size + ggml_curr_max_dynamic_size(); @@ -3599,7 +3785,7 @@ class StableDiffusionGGML { struct ggml_tensor* result = NULL; // calculate the amount of memory required - size_t ctx_size = 10 * 1024 * 1024; // 10MB + size_t ctx_size = 10 * 1024 * 1024; // 10MB { struct ggml_init_params params; params.mem_size = ctx_size; @@ -3728,7 +3914,7 @@ class StableDiffusionGGML { } // calculate the amount of memory required - size_t ctx_size = 10 * 1024 * 1024; // 10MB + size_t ctx_size = 10 * 1024 * 1024; // 10MB { struct ggml_init_params params; params.mem_size = ctx_size; diff --git a/stable-diffusion.h b/stable-diffusion.h index 4112265..8118633 100644 --- a/stable-diffusion.h +++ b/stable-diffusion.h @@ -17,7 +17,12 @@ enum RNGType { }; enum SampleMethod { - EULAR_A, + EULER_A, + EULER, + HEUN, + DPMPP2M, + DPMPP2Mv2, + N_SAMPLE_METHODS }; class StableDiffusionGGML;
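
For reference, minimal standalone sketches of the added samplers follow; they are not part of the patch and every helper name in them is illustrative. The plain Euler sampler introduced here differs from the existing Euler A path only in that it skips the ancestral split of the step into sigma_down/sigma_up and the noise re-injection. A sketch of both updates on a flat float buffer, with std::mt19937 standing in for the patch's RNG abstraction and ggml tensors:

// Minimal sketch of the Euler and Euler-ancestral updates on a flat float
// buffer. Function names and the std::mt19937 noise source are illustrative
// stand-ins for the ggml tensors and RNG abstraction in stable-diffusion.cpp.
#include <algorithm>
#include <cmath>
#include <cstddef>
#include <random>
#include <vector>

// Plain Euler: follow d = (x - denoised) / sigma for the full step to sigma_next.
void euler_step(std::vector<float>& x, const std::vector<float>& denoised,
                float sigma, float sigma_next) {
    float dt = sigma_next - sigma;
    for (std::size_t j = 0; j < x.size(); j++) {
        float d = (x[j] - denoised[j]) / sigma;
        x[j] += d * dt;
    }
}

// Euler ancestral: step only to sigma_down, then re-inject sigma_up of fresh noise.
void euler_ancestral_step(std::vector<float>& x, const std::vector<float>& denoised,
                          float sigma, float sigma_next, std::mt19937& rng) {
    float sigma_up = std::min(sigma_next,
        std::sqrt(sigma_next * sigma_next * (sigma * sigma - sigma_next * sigma_next) / (sigma * sigma)));
    float sigma_down = std::sqrt(sigma_next * sigma_next - sigma_up * sigma_up);
    float dt = sigma_down - sigma;
    std::normal_distribution<float> gauss(0.0f, 1.0f);
    for (std::size_t j = 0; j < x.size(); j++) {
        float d = (x[j] - denoised[j]) / sigma;
        x[j] += d * dt;
        if (sigma_next > 0) {
            x[j] += gauss(rng) * sigma_up;
        }
    }
}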
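
The Heun branch is a predictor-corrector refinement of the same ODE step: it takes an Euler step to the next sigma, evaluates the model once more there, and then redoes the step with the average of the two slopes, falling back to plain Euler on the final step where sigma_next == 0 would make the second slope divide by zero. A sketch under the assumption of a caller-supplied denoiser callback (hypothetical; the real code drives the ggml diffusion graph through the denoise() lambda above):

// Sketch of one Heun step: Euler prediction to sigma_next, a second slope
// estimate there, then an update with the averaged slope. The Denoiser
// callback is a hypothetical stand-in for the denoise() lambda in the patch.
#include <cstddef>
#include <functional>
#include <vector>

using Denoiser = std::function<std::vector<float>(const std::vector<float>& x, float sigma)>;

void heun_step(std::vector<float>& x, const Denoiser& denoise_fn,
               float sigma, float sigma_next) {
    std::vector<float> denoised = denoise_fn(x, sigma);
    std::vector<float> d(x.size());
    for (std::size_t j = 0; j < x.size(); j++) {
        d[j] = (x[j] - denoised[j]) / sigma;
    }
    float dt = sigma_next - sigma;
    if (sigma_next == 0) {
        // Final step: plain Euler (the second slope would divide by zero).
        for (std::size_t j = 0; j < x.size(); j++) {
            x[j] += d[j] * dt;
        }
        return;
    }
    // Predictor: provisional Euler step to sigma_next.
    std::vector<float> x2(x.size());
    for (std::size_t j = 0; j < x.size(); j++) {
        x2[j] = x[j] + d[j] * dt;
    }
    // Corrector: average the slopes at sigma and sigma_next.
    std::vector<float> denoised2 = denoise_fn(x2, sigma_next);
    for (std::size_t j = 0; j < x.size(); j++) {
        float d2 = (x2[j] - denoised2[j]) / sigma_next;
        x[j] += 0.5f * (d[j] + d2) * dt;
    }
}

As in the patch, every Heun step except the last calls the model twice, so at equal step counts it costs roughly twice as much as Euler.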
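
DPM++ (2M) is a linear multistep update in t = -log(sigma) space that reuses the previous step's denoised prediction; the "v2" variant from the linked webui discussion changes only how the step coefficients are formed, using the average of the current and previous log-sigma step sizes, which per the commit message works around overstepping of DPM++ (2M) at small step counts. A side-by-side sketch of the two coefficient choices, again on a plain float buffer and with illustrative names only:

// Sketch of the multistep DPM++ (2M) update in t = -log(sigma) space and the
// "v2" coefficient choice, assuming the current and previous denoised
// predictions are already available. All names are illustrative.
#include <algorithm>
#include <cmath>
#include <cstddef>
#include <vector>

void dpmpp2m_step(std::vector<float>& x,
                  const std::vector<float>& denoised,      // model prediction at sigma
                  const std::vector<float>& old_denoised,  // prediction from the previous step
                  float sigma_prev, float sigma, float sigma_next,
                  bool first_or_last,                      // i == 0 or sigma_next == 0
                  bool v2) {
    auto t_fn = [](float s) { return -std::log(s); };
    float t = t_fn(sigma);
    float h = t_fn(sigma_next) - t;
    float a = sigma_next / sigma;

    if (first_or_last) {
        // Single-step update used at the edges; sigma_prev and old_denoised are unused here.
        float b = std::exp(-h) - 1.0f;
        for (std::size_t j = 0; j < x.size(); j++) {
            x[j] = a * x[j] - b * denoised[j];
        }
        return;
    }

    float h_last = t - t_fn(sigma_prev);
    float r, b;
    if (!v2) {
        // Original DPM++ (2M): ratio from h_last / h, coefficient from the full step h.
        r = h_last / h;
        b = std::exp(-h) - 1.0f;
    } else {
        // "v2": coefficient from the average of the two log-sigma step sizes,
        // which per the commit message avoids overstepping at small step counts.
        float h_min = std::min(h_last, h);
        float h_max = std::max(h_last, h);
        r = h_max / h_min;
        b = std::exp(-(h_max + h_min) / 2.0f) - 1.0f;
    }
    for (std::size_t j = 0; j < x.size(); j++) {
        float denoised_d = (1.0f + 1.0f / (2.0f * r)) * denoised[j]
                         - (1.0f / (2.0f * r)) * old_denoised[j];
        x[j] = a * x[j] - b * denoised_d;
    }
}

On the command line the two variants are selected with --sampling-method dpm++2m and --sampling-method dpm++2mv2.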