From e5a7aec252b71caaad2837a50a1411b7ea229ea3 Mon Sep 17 00:00:00 2001 From: leejet Date: Sun, 3 Sep 2023 19:24:07 +0800 Subject: [PATCH] feat: add CUDA RNG --- README.md | 3 +- examples/main.cpp | 24 ++++++++- rng.h | 35 ++++++++++++ rng_philox.h | 125 +++++++++++++++++++++++++++++++++++++++++++ stable-diffusion.cpp | 46 ++++++++-------- stable-diffusion.h | 10 +++- 6 files changed, 217 insertions(+), 26 deletions(-) create mode 100644 rng.h create mode 100644 rng_philox.h diff --git a/README.md b/README.md index 443d8c5..b61cb5f 100644 --- a/README.md +++ b/README.md @@ -20,6 +20,7 @@ Inference of [Stable Diffusion](https://github.com/CompVis/stable-diffusion) in - [stable-diffusion-webui](https://github.com/AUTOMATIC1111/stable-diffusion-webui) style tokenizer (not all the features, only token weighting for now) - Sampling method - `Euler A` +- Cross-platform reproducibility (`--rng cuda`, consistent with the `stable-diffusion-webui GPU RNG`) - Supported platforms - Linux - Mac OS @@ -35,8 +36,6 @@ Inference of [Stable Diffusion](https://github.com/CompVis/stable-diffusion) in - [ ] Continuing to reduce memory usage (quantizing the weights of ggml_conv_2d) - [ ] LoRA support - [ ] k-quants support -- [ ] Cross-platform reproducibility (perhaps ensuring consistency with the original SD) -- [ ] Adapting to more weight formats ## Usage diff --git a/examples/main.cpp b/examples/main.cpp index 6cae6ac..3a1817c 100644 --- a/examples/main.cpp +++ b/examples/main.cpp @@ -67,6 +67,11 @@ int32_t get_num_physical_cores() { return n_threads > 0 ? (n_threads <= 4 ? 
n_threads : n_threads / 2) : 4; } +const char* rng_type_to_str[] = { + "std_default", + "cuda", +}; + struct Option { int n_threads = -1; std::string mode = TXT2IMG; @@ -81,6 +86,7 @@ struct Option { SampleMethod sample_method = EULAR_A; int sample_steps = 20; float strength = 0.75f; + RNGType rng_type = STD_DEFAULT_RNG; int seed = 42; bool verbose = false; @@ -99,6 +105,7 @@ struct Option { printf(" sample_method: %s\n", "eular a"); printf(" sample_steps: %d\n", sample_steps); printf(" strength: %.2f\n", strength); + printf(" rng: %s\n", rng_type_to_str[rng_type]); printf(" seed: %d\n", seed); } }; @@ -123,6 +130,7 @@ void print_usage(int argc, const char* argv[]) { printf(" -W, --width W image width, in pixel space (default: 512)\n"); printf(" --sample-method SAMPLE_METHOD sample method (default: \"eular a\")\n"); printf(" --steps STEPS number of sample steps (default: 20)\n"); + printf(" --rng {std_default, cuda} RNG (default: std_default)\n"); printf(" -s SEED, --seed SEED RNG seed (default: 42, use random seed for < 0)\n"); printf(" -v, --verbose print extra info\n"); } @@ -206,6 +214,20 @@ void parse_args(int argc, const char* argv[], Option* opt) { break; } opt->sample_steps = std::stoi(argv[i]); + } else if (arg == "--rng") { + if (++i >= argc) { + invalid_arg = true; + break; + } + std::string rng_type_str = argv[i]; + if (rng_type_str == "std_default") { + opt->rng_type = STD_DEFAULT_RNG; + } else if (rng_type_str == "cuda") { + opt->rng_type = CUDA_RNG; + } else { + invalid_arg = true; + break; + } } else if (arg == "-s" || arg == "--seed") { if (++i >= argc) { invalid_arg = true; @@ -328,7 +350,7 @@ int main(int argc, const char* argv[]) { init_img.assign(img_data, img_data + (opt.w * opt.h * c)); } - StableDiffusion sd(opt.n_threads, vae_decode_only, true); + StableDiffusion sd(opt.n_threads, vae_decode_only, true, opt.rng_type); if (!sd.load_from_file(opt.model_path)) { return 1; } diff --git a/rng.h b/rng.h new file mode 100644 index 0000000..a3cb974 
--- /dev/null +++ b/rng.h @@ -0,0 +1,35 @@ +#ifndef __RNG_H__ +#define __RNG_H__ + +#include +#include + +class RNG { + public: + virtual void manual_seed(uint32_t seed) = 0; + virtual std::vector randn(uint32_t n) = 0; +}; + +class STDDefaultRNG : public RNG { + private: + std::default_random_engine generator; + + public: + void manual_seed(uint32_t seed) { + generator.seed(seed); + } + + std::vector randn(uint32_t n) { + std::vector result; + float mean = 0.0; + float stddev = 1.0; + std::normal_distribution distribution(mean, stddev); + for (int i = 0; i < n; i++) { + float random_number = distribution(generator); + result.push_back(random_number); + } + return result; + } +}; + +#endif // __RNG_H__ \ No newline at end of file diff --git a/rng_philox.h b/rng_philox.h new file mode 100644 index 0000000..a159c9a --- /dev/null +++ b/rng_philox.h @@ -0,0 +1,125 @@ +#ifndef __RNG_PHILOX_H__ +#define __RNG_PHILOX_H__ + +#include +#include + +#include "rng.h" + +// RNG imitating torch cuda randn on CPU. +// Ported from: https://github.com/AUTOMATIC1111/stable-diffusion-webui/blob/5ef669de080814067961f28357256e8fe27544f4/modules/rng_philox.py +class PhiloxRNG : public RNG { + private: + uint64_t seed; + uint32_t offset; + + private: + std::vector philox_m = {0xD2511F53, 0xCD9E8D57}; + std::vector philox_w = {0x9E3779B9, 0xBB67AE85}; + float two_pow32_inv = 2.3283064e-10; + float two_pow32_inv_2pi = 2.3283064e-10 * 6.2831855; + + std::vector uint32(uint64_t x) { + std::vector result(2); + result[0] = static_cast(x & 0xFFFFFFFF); + result[1] = static_cast(x >> 32); + return result; + } + + std::vector> uint32(const std::vector& x) { + int N = x.size(); + std::vector> result(2, std::vector(N)); + + for (int i = 0; i < N; ++i) { + result[0][i] = static_cast(x[i] & 0xFFFFFFFF); + result[1][i] = static_cast(x[i] >> 32); + } + + return result; + } + + // A single round of the Philox 4x32 random number generator. 
+ void philox4_round(std::vector>& counter, + const std::vector>& key) { + uint32_t N = counter[0].size(); + for (uint32_t i = 0; i < N; i++) { + std::vector v1 = uint32(static_cast(counter[0][i]) * static_cast(philox_m[0])); + std::vector v2 = uint32(static_cast(counter[2][i]) * static_cast(philox_m[1])); + + counter[0][i] = v2[1] ^ counter[1][i] ^ key[0][i]; + counter[1][i] = v2[0]; + counter[2][i] = v1[1] ^ counter[3][i] ^ key[1][i]; + counter[3][i] = v1[0]; + } + } + + // Generates 32-bit random numbers using the Philox 4x32 random number generator. + // Parameters: + // counter : A 4xN array of 32-bit integers representing the counter values (offset into generation). + // key : A 2xN array of 32-bit integers representing the key values (seed). + // rounds : The number of rounds to perform. + // Returns: + // std::vector>: A 4xN array of 32-bit integers containing the generated random numbers. + std::vector> philox4_32(std::vector>& counter, + std::vector>& key, + int rounds = 10) { + uint32_t N = counter[0].size(); + for (int i = 0; i < rounds - 1; ++i) { + philox4_round(counter, key); + + for (uint32_t j = 0; j < N; ++j) { + key[0][j] += philox_w[0]; + key[1][j] += philox_w[1]; + } + } + + philox4_round(counter, key); + return counter; + } + + float box_muller(float x, float y) { + float u = x * two_pow32_inv + two_pow32_inv / 2; + float v = y * two_pow32_inv_2pi + two_pow32_inv_2pi / 2; + + float s = sqrt(-2.0 * log(u)); + + float r1 = s * sin(v); + return r1; + } + + public: + PhiloxRNG(uint64_t seed = 0) { + this->seed = seed; + this->offset = 0; + } + + void manual_seed(uint32_t seed) { + this->seed = seed; + this->offset = 0; + } + + std::vector randn(uint32_t n) { + std::vector> counter(4, std::vector(n, 0)); + for (uint32_t i = 0; i < n; i++) { + counter[0][i] = this->offset; + } + + for (uint32_t i = 0; i < n; i++) { + counter[2][i] = i; + } + this->offset += 1; + + std::vector key(n, this->seed); + std::vector> key_uint32 = uint32(key); + + 
std::vector> g = philox4_32(counter, key_uint32); + + std::vector result; + for (int i = 0; i < n; ++i) { + result.push_back(box_muller(g[0][i], g[1][i])); + } + return result; + } +}; + +#endif // __RNG_PHILOX_H__ \ No newline at end of file diff --git a/stable-diffusion.cpp b/stable-diffusion.cpp index 48dd429..3eb0972 100644 --- a/stable-diffusion.cpp +++ b/stable-diffusion.cpp @@ -15,6 +15,8 @@ #include "ggml/ggml.h" #include "stable-diffusion.h" +#include "rng.h" +#include "rng_philox.h" static SDLogLevel log_level = SDLogLevel::INFO; @@ -117,19 +119,11 @@ ggml_tensor* load_tensor_from_file(ggml_context* ctx, const std::string& file_pa return tensor; } -static std::default_random_engine generator; - -void set_random_seed(int seed) { - generator.seed(seed); -} - -void ggml_tensor_set_f32_randn(struct ggml_tensor* tensor) { - float mean = 0.0; - float stddev = 1.0; - std::normal_distribution distribution(mean, stddev); - for (int i = 0; i < ggml_nelements(tensor); i++) { - float random_number = distribution(generator); - ggml_set_f32_1d(tensor, i, random_number); +void ggml_tensor_set_f32_randn(struct ggml_tensor* tensor, std::shared_ptr rng) { + uint32_t n = ggml_nelements(tensor); + std::vector random_numbers = rng->randn(n); + for (int i = 0; i < n; i++) { + ggml_set_f32_1d(tensor, i, random_numbers[i]); } } @@ -2747,6 +2741,8 @@ class StableDiffusionGGML { bool dynamic = true; bool vae_decode_only = false; bool free_params_immediately = false; + + std::shared_ptr rng = std::make_shared(); int32_t ftype = 1; int n_threads = -1; float scale_factor = 0.18215f; @@ -2765,11 +2761,17 @@ class StableDiffusionGGML { StableDiffusionGGML(int n_threads, bool vae_decode_only, - bool free_params_immediately) + bool free_params_immediately, + RNGType rng_type) : n_threads(n_threads), vae_decode_only(vae_decode_only), free_params_immediately(free_params_immediately) { first_stage_model.decode_only = vae_decode_only; + if (rng_type == STD_DEFAULT_RNG) { + rng = 
std::make_shared(); + } else if (rng_type == CUDA_RNG) { + rng = std::make_shared(); + } } ~StableDiffusionGGML() { @@ -3539,7 +3541,7 @@ class StableDiffusionGGML { if (sigmas[i + 1] > 0) { // x = x + noise_sampler(sigmas[i], sigmas[i + 1]) * s_noise * sigma_up - ggml_tensor_set_f32_randn(noise); + ggml_tensor_set_f32_randn(noise, rng); // noise = load_tensor_from_file(res_ctx, "./rand" + std::to_string(i+1) + ".bin"); { float* vec_x = (float*)x->data; @@ -3674,7 +3676,7 @@ class StableDiffusionGGML { ggml_tensor* latent = ggml_new_tensor_4d(res_ctx, moments->type, moments->ne[0], moments->ne[1], moments->ne[2] / 2, moments->ne[3]); struct ggml_tensor* noise = ggml_dup_tensor(res_ctx, latent); - ggml_tensor_set_f32_randn(noise); + ggml_tensor_set_f32_randn(noise, rng); // noise = load_tensor_from_file(res_ctx, "noise.bin"); { float mean = 0; @@ -3802,10 +3804,12 @@ class StableDiffusionGGML { StableDiffusion::StableDiffusion(int n_threads, bool vae_decode_only, - bool free_params_immediately) { + bool free_params_immediately, + RNGType rng_type) { sd = std::make_shared(n_threads, vae_decode_only, - free_params_immediately); + free_params_immediately, + rng_type); } bool StableDiffusion::load_from_file(const std::string& file_path) { @@ -3835,7 +3839,7 @@ std::vector StableDiffusion::txt2img(const std::string& prompt, if (seed < 0) { seed = (int)time(NULL); } - set_random_seed(seed); + sd->rng->manual_seed(seed); int64_t t0 = ggml_time_ms(); ggml_tensor* c = sd->get_learned_condition(ctx, prompt); @@ -3856,7 +3860,7 @@ std::vector StableDiffusion::txt2img(const std::string& prompt, int W = width / 8; int H = height / 8; struct ggml_tensor* x_t = ggml_new_tensor_4d(ctx, GGML_TYPE_F32, W, H, C, 1); - ggml_tensor_set_f32_randn(x_t); + ggml_tensor_set_f32_randn(x_t, sd->rng); std::vector sigmas = sd->denoiser->get_sigmas(sample_steps); @@ -3935,7 +3939,7 @@ std::vector StableDiffusion::img2img(const std::vector& init_i if (seed < 0) { seed = (int)time(NULL); } - 
set_random_seed(seed); + sd->rng->manual_seed(seed); ggml_tensor* init_img = ggml_new_tensor_4d(ctx, GGML_TYPE_F32, width, height, 3, 1); image_vec_to_ggml(init_img_vec, init_img); diff --git a/stable-diffusion.h b/stable-diffusion.h index 730a655..11c892f 100644 --- a/stable-diffusion.h +++ b/stable-diffusion.h @@ -4,13 +4,18 @@ #include #include -enum class SDLogLevel { +enum SDLogLevel { DEBUG, INFO, WARN, ERROR }; +enum RNGType { + STD_DEFAULT_RNG, + CUDA_RNG +}; + enum SampleMethod { EULAR_A, }; @@ -24,7 +29,8 @@ class StableDiffusion { public: StableDiffusion(int n_threads = -1, bool vae_decode_only = false, - bool free_params_immediately = false); + bool free_params_immediately = false, + RNGType rng_type = STD_DEFAULT_RNG); bool load_from_file(const std::string& file_path); std::vector txt2img( const std::string& prompt,