diff --git a/.clang-format b/.clang-format
new file mode 100644
index 0000000..58d1885
--- /dev/null
+++ b/.clang-format
@@ -0,0 +1,12 @@
+BasedOnStyle: Chromium
+UseTab: Never
+IndentWidth: 4
+TabWidth: 4
+AllowShortIfStatementsOnASingleLine: false
+ColumnLimit: 0
+AccessModifierOffset: -4
+NamespaceIndentation: All
+FixNamespaceComments: false
+AlignAfterOpenBracket: true
+AlignConsecutiveAssignments: true
+IndentCaseLabels: true
\ No newline at end of file
diff --git a/.gitignore b/.gitignore
index 59a8a2c..fd8ae08 100644
--- a/.gitignore
+++ b/.gitignore
@@ -3,3 +3,4 @@
 test/
 .cache/
 *.swp
+.vscode/
diff --git a/examples/main.cpp b/examples/main.cpp
index b97035a..f4d6ca8 100644
--- a/examples/main.cpp
+++ b/examples/main.cpp
@@ -88,11 +88,10 @@ const char* sample_method_str[] = {
 const char* schedule_str[] = {
     "default",
     "discrete",
-    "karras"
-};
+    "karras"};
 
 struct Option {
-    int n_threads = -1;
+    int n_threads    = -1;
     std::string mode = TXT2IMG;
     std::string model_path;
     std::string lora_model_dir;
@@ -100,16 +99,16 @@ struct Option {
     std::string init_img;
     std::string prompt;
     std::string negative_prompt;
-    float cfg_scale = 7.0f;
-    int w = 512;
-    int h = 512;
+    float cfg_scale            = 7.0f;
+    int w                      = 512;
+    int h                      = 512;
     SampleMethod sample_method = EULER_A;
-    Schedule schedule = DEFAULT;
-    int sample_steps = 20;
-    float strength = 0.75f;
-    RNGType rng_type = CUDA_RNG;
-    int64_t seed = 42;
-    bool verbose = false;
+    Schedule schedule          = DEFAULT;
+    int sample_steps           = 20;
+    float strength             = 0.75f;
+    RNGType rng_type           = CUDA_RNG;
+    int64_t seed               = 42;
+    bool verbose               = false;
 
     void print() {
         printf("Option: \n");
@@ -129,7 +128,7 @@ struct Option {
         printf("    sample_steps: %d\n", sample_steps);
         printf("    strength: %.2f\n", strength);
         printf("    rng: %s\n", rng_type_to_str[rng_type]);
-        printf("    seed: %lld\n", seed);
+        printf("    seed: %ld\n", seed);
     }
 };
 
@@ -266,7 +265,7 @@ void parse_args(int argc, const char* argv[], Option* opt) {
                 break;
             }
             const char* schedule_selected = argv[i];
-            int schedule_found = -1;
+            int schedule_found            = -1;
             for (int d = 0; d < N_SCHEDULES; d++) {
                 if (!strcmp(schedule_selected, schedule_str[d])) {
                     schedule_found = d;
@@ -289,7 +288,7 @@ void parse_args(int argc, const char* argv[], Option* opt) {
                 break;
             }
             const char* sample_method_selected = argv[i];
-            int sample_method_found = -1;
+            int sample_method_found            = -1;
             for (int m = 0; m < N_SAMPLE_METHODS; m++) {
                 if (!strcmp(sample_method_selected, sample_method_str[m])) {
                     sample_method_found = m;
@@ -405,6 +404,7 @@ int main(int argc, const char* argv[]) {
         vae_decode_only = false;
 
         int c = 0;
+
         unsigned char* img_data = stbi_load(opt.init_img.c_str(), &opt.w, &opt.h, &c, 3);
         if (img_data == NULL) {
             fprintf(stderr, "load image from '%s' failed\n", opt.init_img.c_str());
diff --git a/rng.h b/rng.h
index 73d998b..3340be6 100644
--- a/rng.h
+++ b/rng.h
@@ -5,23 +5,23 @@
 #include <vector>
 
 class RNG {
-   public:
-    virtual void manual_seed(uint64_t seed) = 0;
+public:
+    virtual void manual_seed(uint64_t seed)      = 0;
     virtual std::vector<float> randn(uint32_t n) = 0;
 };
 
 class STDDefaultRNG : public RNG {
-   private:
+private:
     std::default_random_engine generator;
 
-   public:
+public:
     void manual_seed(uint64_t seed) {
         generator.seed((unsigned int)seed);
     }
 
     std::vector<float> randn(uint32_t n) {
         std::vector<float> result;
-        float mean = 0.0;
+        float mean   = 0.0;
         float stddev = 1.0;
         std::normal_distribution<float> distribution(mean, stddev);
         for (uint32_t i = 0; i < n; i++) {
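The `rng.h` hunk above only reindents the `RNG` interface that both generators implement. For orientation, a minimal usage sketch, assuming `rng.h` and `rng_philox.h` from this repository are on the include path (this snippet is not part of the patch):

```cpp
// Sketch only: exercises the RNG interface shown above.
#include <cstdio>
#include <memory>
#include <vector>

#include "rng.h"
#include "rng_philox.h"

int main() {
    // Either implementation can sit behind the shared pointer;
    // PhiloxRNG imitates torch's CUDA randn sequence on the CPU.
    std::shared_ptr<RNG> rng = std::make_shared<PhiloxRNG>();
    rng->manual_seed(42);
    std::vector<float> noise = rng->randn(4);  // 4 samples from N(0, 1)
    for (float v : noise) {
        printf("%f\n", v);
    }
}
```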
diff --git a/rng_philox.h b/rng_philox.h
index e334768..5672b5a 100644
--- a/rng_philox.h
+++ b/rng_philox.h
@@ -9,15 +9,15 @@
 // RNG imitating torch cuda randn on CPU.
 // Port from: https://github.com/AUTOMATIC1111/stable-diffusion-webui/blob/5ef669de080814067961f28357256e8fe27544f4/modules/rng_philox.py
 class PhiloxRNG : public RNG {
-   private:
+private:
     uint64_t seed;
     uint32_t offset;
 
-   private:
+private:
     std::vector<uint32_t> philox_m = {0xD2511F53, 0xCD9E8D57};
     std::vector<uint32_t> philox_w = {0x9E3779B9, 0xBB67AE85};
-    float two_pow32_inv = 2.3283064e-10f;
-    float two_pow32_inv_2pi = 2.3283064e-10f * 6.2831855f;
+    float two_pow32_inv            = 2.3283064e-10f;
+    float two_pow32_inv_2pi        = 2.3283064e-10f * 6.2831855f;
 
     std::vector<uint32_t> uint32(uint64_t x) {
         std::vector<uint32_t> result(2);
@@ -87,14 +87,14 @@ class PhiloxRNG : public RNG {
         return r1;
     }
 
-   public:
+public:
     PhiloxRNG(uint64_t seed = 0) {
-        this->seed = seed;
+        this->seed   = seed;
         this->offset = 0;
     }
 
     void manual_seed(uint64_t seed) {
-        this->seed = seed;
+        this->seed   = seed;
         this->offset = 0;
     }
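`two_pow32_inv` is 2^-32 and `two_pow32_inv_2pi` is 2^-32 * 2*pi: exactly the constants needed to map pairs of 32-bit Philox outputs onto standard normals via the Box-Muller transform, which is what the referenced webui `rng_philox.py` does. A hedged sketch of that transform alone (the full function body lies outside this diff):

```cpp
// Box-Muller sketch: maps two uniform uint32 values to one standard normal.
// The scaling constants mirror two_pow32_inv and two_pow32_inv_2pi above.
#include <cmath>
#include <cstdint>

float box_muller(uint32_t a, uint32_t b) {
    const float two_pow32_inv     = 2.3283064e-10f;               // 2^-32
    const float two_pow32_inv_2pi = 2.3283064e-10f * 6.2831855f;  // 2^-32 * 2*pi
    float u = a * two_pow32_inv + two_pow32_inv / 2;              // uniform in (0, 1)
    float v = b * two_pow32_inv_2pi + two_pow32_inv_2pi / 2;      // uniform in (0, 2*pi)
    return std::sqrt(-2.0f * std::log(u)) * std::cos(v);
}
```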
diff --git a/stable-diffusion.cpp b/stable-diffusion.cpp
index 8339501..1a0b05d 100644
--- a/stable-diffusion.cpp
+++ b/stable-diffusion.cpp
@@ -108,7 +108,7 @@ ggml_tensor* load_tensor_from_file(ggml_context* ctx, const std::string& file_pa
     }
     int32_t nelements = 1;
-    int32_t ne[4] = {1, 1, 1, 1};
+    int32_t ne[4]     = {1, 1, 1, 1};
     for (int i = 0; i < n_dims; ++i) {
         file.read(reinterpret_cast<char*>(&ne[i]), sizeof(ne[i]));
         nelements *= ne[i];
@@ -116,13 +116,13 @@ ggml_tensor* load_tensor_from_file(ggml_context* ctx, const std::string& file_pa
     std::string name(length, 0);
     file.read(&name[0], length);
     ggml_tensor* tensor = ggml_new_tensor_4d(ctx, (ggml_type)ttype, ne[0], ne[1], ne[2], ne[3]);
-    const size_t bpe = ggml_type_size(ggml_type(ttype));
+    const size_t bpe    = ggml_type_size(ggml_type(ttype));
     file.read(reinterpret_cast<char*>(tensor->data), ggml_nbytes(tensor));
     return tensor;
 }
 
 void ggml_tensor_set_f32_randn(struct ggml_tensor* tensor, std::shared_ptr<RNG> rng) {
-    uint32_t n = (uint32_t)ggml_nelements(tensor);
+    uint32_t n                        = (uint32_t)ggml_nelements(tensor);
     std::vector<float> random_numbers = rng->randn(n);
     for (uint32_t i = 0; i < n; i++) {
         ggml_set_f32_1d(tensor, i, random_numbers[i]);
@@ -231,7 +231,7 @@ std::vector<uint8_t> ggml_to_image_vec(struct ggml_tensor* t) {
         for (int j = 0; j < w; j++) {
             for (int k = 0; k < c; k++) {
                 float value = ggml_tensor_get_f32(t, j, i, k);
-                value = (value + 1.0f) * 0.5f;
+                value       = (value + 1.0f) * 0.5f;
                 if (value < 0) {
                     value = 0;
                 } else if (value > 1) {
@@ -247,16 +247,16 @@ std::vector<uint8_t> ggml_to_image_vec(struct ggml_tensor* t) {
 
 void image_vec_to_ggml(const std::vector<uint8_t>& vec,
                        struct ggml_tensor* t) {
-    int64_t w = t->ne[0];
-    int64_t h = t->ne[1];
-    int64_t c = t->ne[2];
+    int64_t w     = t->ne[0];
+    int64_t h     = t->ne[1];
+    int64_t c     = t->ne[2];
     uint8_t* data = (uint8_t*)vec.data();
     for (int i = 0; i < h; i++) {
         for (int j = 0; j < w; j++) {
             for (int k = 0; k < c; k++) {
                 float value = *(data + i * w * c + j * c + k);
-                value = value / 255.f;
-                value = 2 * value - 1;
+                value       = value / 255.f;
+                value       = 2 * value - 1;
                 ggml_tensor_set_f32(t, value, j, i, k);
             }
         }
@@ -275,7 +275,7 @@ std::pair<std::unordered_map<std::string, float>, std::string> extract_and_remov
     while (std::regex_search(text, matches, re)) {
         std::string filename = matches[1].str();
-        float multiplier = std::stof(matches[2].str());
+        float multiplier     = std::stof(matches[2].str());
         if (multiplier < 0.f) {
             continue;
         }
@@ -322,14 +322,14 @@ const int PAD_TOKEN_ID = 49407;
 // Ref: https://github.com/openai/CLIP/blob/main/clip/simple_tokenizer.py
 // TODO: implement bpe
 class CLIPTokenizer {
-   private:
+private:
     ModelType model_type = SD1;
     std::map<std::u32string, int> encoder;
     std::regex pat;
 
     static std::string strip(const std::string& str) {
         std::string::size_type start = str.find_first_not_of(" \t\n\r\v\f");
-        std::string::size_type end = str.find_last_not_of(" \t\n\r\v\f");
+        std::string::size_type end   = str.find_last_not_of(" \t\n\r\v\f");
 
         if (start == std::string::npos) {
             // String contains only whitespace characters
@@ -345,7 +345,7 @@ class CLIPTokenizer {
         return text;
     }
 
-   public:
+public:
     CLIPTokenizer(ModelType model_type = SD1)
         : model_type(model_type){};
 
     std::string bpe(std::string token) {
@@ -457,7 +457,7 @@ std::vector<std::pair<std::string, float>> parse_prompt_attention(const std::str
     std::vector<int> round_brackets;
     std::vector<int> square_brackets;
 
-    float round_bracket_multiplier = 1.1f;
+    float round_bracket_multiplier  = 1.1f;
     float square_bracket_multiplier = 1 / 1.1f;
 
     std::regex re_attention(R"(\\\(|\\\)|\\\[|\\\]|\\\\|\\|\(|\[|:([+-]?[.\d]+)\)|\)|\]|[^\\()\[\]:]+|:)");
@@ -473,7 +473,7 @@ std::vector<std::pair<std::string, float>> parse_prompt_attention(const std::str
     std::string remaining_text = text;
 
     while (std::regex_search(remaining_text, m, re_attention)) {
-        std::string text = m[0];
+        std::string text   = m[0];
         std::string weight = m[1];
 
         if (text == "(") {
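`parse_prompt_attention` implements the webui emphasis syntax: `(text)` multiplies the attention weight by 1.1, `[text]` divides by 1.1, and `(text:1.5)` sets an explicit weight. A hypothetical call, assuming the file-local function were exposed (illustrative only; the exact segmentation depends on the regex above):

```cpp
// Illustrative: expected (text, weight) pairs for one prompt, based on
// the 1.1 and 1 / 1.1 multipliers defined above.
auto parsed = parse_prompt_attention("a (red:1.5) car in [fog]");
// parsed ~ { {"a ", 1.0f}, {"red", 1.5f}, {" car in ", 1.0f}, {"fog", 1.0f / 1.1f} }
for (const auto& item : parsed) {
    printf("'%s' -> %.3f\n", item.first.c_str(), item.second);
}
```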
@@ -595,32 +595,32 @@ struct ResidualAttentionBlock {
     }
 
     void map_by_name(std::map<std::string, struct ggml_tensor*>& tensors, const std::string prefix) {
-        tensors[prefix + "self_attn.q_proj.weight"] = q_w;
-        tensors[prefix + "self_attn.q_proj.bias"] = q_b;
-        tensors[prefix + "self_attn.k_proj.weight"] = k_w;
-        tensors[prefix + "self_attn.k_proj.bias"] = k_b;
-        tensors[prefix + "self_attn.v_proj.weight"] = v_w;
-        tensors[prefix + "self_attn.v_proj.bias"] = v_b;
+        tensors[prefix + "self_attn.q_proj.weight"]   = q_w;
+        tensors[prefix + "self_attn.q_proj.bias"]     = q_b;
+        tensors[prefix + "self_attn.k_proj.weight"]   = k_w;
+        tensors[prefix + "self_attn.k_proj.bias"]     = k_b;
+        tensors[prefix + "self_attn.v_proj.weight"]   = v_w;
+        tensors[prefix + "self_attn.v_proj.bias"]     = v_b;
         tensors[prefix + "self_attn.out_proj.weight"] = out_w;
-        tensors[prefix + "self_attn.out_proj.bias"] = out_b;
+        tensors[prefix + "self_attn.out_proj.bias"]   = out_b;
 
         tensors[prefix + "layer_norm1.weight"] = ln1_w;
-        tensors[prefix + "layer_norm1.bias"] = ln1_b;
+        tensors[prefix + "layer_norm1.bias"]   = ln1_b;
         tensors[prefix + "layer_norm2.weight"] = ln2_w;
-        tensors[prefix + "layer_norm2.bias"] = ln2_b;
+        tensors[prefix + "layer_norm2.bias"]   = ln2_b;
 
         tensors[prefix + "mlp.fc1.weight"] = fc1_w;
-        tensors[prefix + "mlp.fc1.bias"] = fc1_b;
+        tensors[prefix + "mlp.fc1.bias"]   = fc1_b;
         tensors[prefix + "mlp.fc2.weight"] = fc2_w;
-        tensors[prefix + "mlp.fc2.bias"] = fc2_b;
+        tensors[prefix + "mlp.fc2.bias"]   = fc2_b;
     }
 
     struct ggml_tensor* forward(struct ggml_context* ctx, struct ggml_tensor* x) {
         // x: [N, n_token, hidden_size]
-        int64_t N = x->ne[2];
-        int64_t n_token = x->ne[1];
+        int64_t N           = x->ne[2];
+        int64_t n_token     = x->ne[1];
         int64_t hidden_size = n_head * d_model;
 
         struct ggml_tensor* r = x;
@@ -637,24 +637,24 @@ struct ResidualAttentionBlock {
             struct ggml_tensor* q = ggml_add(ctx, ggml_repeat(ctx, q_b, x), ggml_mul_mat(ctx, q_w, x));
-            q = ggml_scale_inplace(ctx, q, ggml_new_f32(ctx, 1.0f / sqrt((float)d_model)));
-            q = ggml_reshape_4d(ctx, q, d_model, n_head, n_token, N);   // [N, n_token, n_head, d_model]
-            q = ggml_cont(ctx, ggml_permute(ctx, q, 0, 2, 1, 3));       // [N, n_head, n_token, d_model]
-            q = ggml_reshape_3d(ctx, q, d_model, n_token, n_head * N);  // [N * n_head, n_token, d_model]
+            q                     = ggml_scale_inplace(ctx, q, ggml_new_f32(ctx, 1.0f / sqrt((float)d_model)));
+            q                     = ggml_reshape_4d(ctx, q, d_model, n_head, n_token, N);   // [N, n_token, n_head, d_model]
+            q                     = ggml_cont(ctx, ggml_permute(ctx, q, 0, 2, 1, 3));       // [N, n_head, n_token, d_model]
+            q                     = ggml_reshape_3d(ctx, q, d_model, n_token, n_head * N);  // [N * n_head, n_token, d_model]
 
             struct ggml_tensor* k = ggml_add(ctx, ggml_repeat(ctx, k_b, x), ggml_mul_mat(ctx, k_w, x));
-            k = ggml_reshape_4d(ctx, k, d_model, n_head, n_token, N);  // [N, n_token, n_head, d_model]
-            k = ggml_cont(ctx, ggml_permute(ctx, k, 0, 2, 1, 3));      // [N, n_head, n_token, d_model]
-            k = ggml_reshape_3d(ctx, k, d_model, n_token, n_head);     // [N * n_head, n_token, d_model]
+            k                     = ggml_reshape_4d(ctx, k, d_model, n_head, n_token, N);  // [N, n_token, n_head, d_model]
+            k                     = ggml_cont(ctx, ggml_permute(ctx, k, 0, 2, 1, 3));      // [N, n_head, n_token, d_model]
+            k                     = ggml_reshape_3d(ctx, k, d_model, n_token, n_head);     // [N * n_head, n_token, d_model]
 
             struct ggml_tensor* v = ggml_add(ctx, ggml_repeat(ctx, v_b, x), ggml_mul_mat(ctx, v_w, x));
-            v = ggml_reshape_4d(ctx, v, d_model, n_head, n_token, N);   // [N, n_token, n_head, d_model]
-            v = ggml_cont(ctx, ggml_permute(ctx, v, 1, 2, 0, 3));       // [N, n_head, d_model, n_token]
-            v = ggml_reshape_3d(ctx, v, n_token, d_model, n_head * N);  // [N * n_head, d_model, n_token]
+            v                     = ggml_reshape_4d(ctx, v, d_model, n_head, n_token, N);   // [N, n_token, n_head, d_model]
+            v                     = ggml_cont(ctx, ggml_permute(ctx, v, 1, 2, 0, 3));       // [N, n_head, d_model, n_token]
+            v                     = ggml_reshape_3d(ctx, v, n_token, d_model, n_head * N);  // [N * n_head, d_model, n_token]
 
             struct ggml_tensor* kq = ggml_mul_mat(ctx, k, q);  // [N * n_head, n_token, n_token]
@@ -662,8 +662,8 @@ struct ResidualAttentionBlock {
             kq = ggml_soft_max_inplace(ctx, kq);
 
             struct ggml_tensor* kqv = ggml_mul_mat(ctx, v, kq);  // [N * n_head, n_token, d_model]
-            kqv = ggml_reshape_4d(ctx, kqv, d_model, n_token, n_head, N);
-            kqv = ggml_cont(ctx, ggml_permute(ctx, kqv, 0, 2, 1, 3));  // [N, n_token, n_head, d_model]
+            kqv                     = ggml_reshape_4d(ctx, kqv, d_model, n_token, n_head, N);
+            kqv                     = ggml_cont(ctx, ggml_permute(ctx, kqv, 0, 2, 1, 3));  // [N, n_token, n_head, d_model]
 
             x = ggml_reshape_2d(ctx, kqv, d_model * n_head, n_token * N);  // // [N * n_token, d_model * n_head]
         }
@@ -708,12 +708,12 @@ struct ResidualAttentionBlock {
 struct CLIPTextModel {
     ModelType model_type = SD1;
     // network hparams
-    int32_t vocab_size = 49408;
+    int32_t vocab_size              = 49408;
     int32_t max_position_embeddings = 77;
-    int32_t hidden_size = 768;  // 1024 for SD 2.x
-    int32_t intermediate_size = 3072;  // 4096 for SD 2.x
-    int32_t n_head = 12;  // num_attention_heads, 16 for SD 2.x
-    int32_t num_hidden_layers = 12;  // 24 for SD 2.x
+    int32_t hidden_size             = 768;   // 1024 for SD 2.x
+    int32_t intermediate_size       = 3072;  // 4096 for SD 2.x
+    int32_t n_head                  = 12;    // num_attention_heads, 16 for SD 2.x
+    int32_t num_hidden_layers       = 12;    // 24 for SD 2.x
 
     // embeddings
     struct ggml_tensor* position_ids;
@@ -727,9 +727,9 @@ struct CLIPTextModel {
     CLIPTextModel(ModelType model_type = SD1)
         : model_type(model_type) {
         if (model_type == SD2) {
-            hidden_size = 1024;
+            hidden_size       = 1024;
             intermediate_size = 4096;
-            n_head = 16;
+            n_head            = 16;
             num_hidden_layers = 24;
         }
         resblocks.resize(num_hidden_layers);
@@ -739,9 +739,9 @@ struct CLIPTextModel {
     void set_resblocks_hp_params() {
         int d_model = hidden_size / n_head;  // 64
         for (int i = 0; i < num_hidden_layers; i++) {
-            resblocks[i].d_model = d_model;
-            resblocks[i].n_head = n_head;
-            resblocks[i].hidden_size = hidden_size;
+            resblocks[i].d_model           = d_model;
+            resblocks[i].n_head            = n_head;
+            resblocks[i].hidden_size       = hidden_size;
             resblocks[i].intermediate_size = intermediate_size;
         }
     }
@@ -764,7 +764,7 @@ struct CLIPTextModel {
         for (int i = 0; i < max_position_embeddings; i++) {
             ggml_set_i32_1d(position_ids, i, i);
         }
-        token_embed_weight = ggml_new_tensor_2d(ctx, wtype, hidden_size, vocab_size);
+        token_embed_weight    = ggml_new_tensor_2d(ctx, wtype, hidden_size, vocab_size);
         position_embed_weight = ggml_new_tensor_2d(ctx, wtype, hidden_size, max_position_embeddings);
 
         for (int i = 0; i < num_hidden_layers; i++) {
@@ -776,10 +776,10 @@ struct CLIPTextModel {
     }
 
     void map_by_name(std::map<std::string, struct ggml_tensor*>& tensors, const std::string prefix) {
-        tensors[prefix + "embeddings.token_embedding.weight"] = token_embed_weight;
+        tensors[prefix + "embeddings.token_embedding.weight"]    = token_embed_weight;
         tensors[prefix + "embeddings.position_embedding.weight"] = position_embed_weight;
-        tensors[prefix + "final_layer_norm.weight"] = final_ln_w;
-        tensors[prefix + "final_layer_norm.bias"] = final_ln_b;
+        tensors[prefix + "final_layer_norm.weight"]              = final_ln_w;
+        tensors[prefix + "final_layer_norm.bias"]                = final_ln_b;
         for (int i = 0; i < num_hidden_layers; i++) {
             resblocks[i].map_by_name(tensors, prefix + "encoder.layers." + std::to_string(i) + ".");
         }
@@ -822,7 +822,7 @@ struct FrozenCLIPEmbedder {
     CLIPTokenizer tokenizer;
     CLIPTextModel text_model;
     struct ggml_tensor* forward(struct ggml_context* ctx, const std::string& prompt) {
-        std::vector<int> tokens = tokenizer.tokenize(prompt, text_model.max_position_embeddings, true);
+        std::vector<int> tokens       = tokenizer.tokenize(prompt, text_model.max_position_embeddings, true);
         struct ggml_tensor* input_ids = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, tokens.size());
         memcpy(input_ids->data, tokens.data(), tokens.size() * ggml_element_size(input_ids));
         struct ggml_tensor* hidden_states = text_model.forward(ctx, input_ids);
@@ -841,7 +841,7 @@ struct FrozenCLIPEmbedderWithCustomWords {
 
     std::pair<std::vector<int>, std::vector<float>> tokenize(std::string text,
                                                              size_t max_length = 0,
-                                                             bool padding = false) {
+                                                             bool padding      = false) {
         auto parsed_attention = parse_prompt_attention(text);
 
         {
@@ -858,7 +858,7 @@ struct FrozenCLIPEmbedderWithCustomWords {
         std::vector<float> weights;
         for (const auto& item : parsed_attention) {
             const std::string& curr_text = item.first;
-            float curr_weight = item.second;
+            float curr_weight            = item.second;
             std::vector<int> curr_tokens = tokenizer.encode(curr_text);
             tokens.insert(tokens.end(), curr_tokens.begin(), curr_tokens.end());
             weights.insert(weights.end(), curr_tokens.size(), curr_weight);
@@ -969,21 +969,21 @@ struct ResBlock {
 
     void map_by_name(std::map<std::string, struct ggml_tensor*>& tensors, const std::string prefix) {
         tensors[prefix + "in_layers.0.weight"] = in_layer_0_w;
-        tensors[prefix + "in_layers.0.bias"] = in_layer_0_b;
+        tensors[prefix + "in_layers.0.bias"]   = in_layer_0_b;
         tensors[prefix + "in_layers.2.weight"] = in_layer_2_w;
-        tensors[prefix + "in_layers.2.bias"] = in_layer_2_b;
+        tensors[prefix + "in_layers.2.bias"]   = in_layer_2_b;
 
         tensors[prefix + "emb_layers.1.weight"] = emb_layer_1_w;
-        tensors[prefix + "emb_layers.1.bias"] = emb_layer_1_b;
+        tensors[prefix + "emb_layers.1.bias"]   = emb_layer_1_b;
 
         tensors[prefix + "out_layers.0.weight"] = out_layer_0_w;
-        tensors[prefix + "out_layers.0.bias"] = out_layer_0_b;
+        tensors[prefix + "out_layers.0.bias"]   = out_layer_0_b;
         tensors[prefix + "out_layers.3.weight"] = out_layer_3_w;
-        tensors[prefix + "out_layers.3.bias"] = out_layer_3_b;
+        tensors[prefix + "out_layers.3.bias"]   = out_layer_3_b;
 
         if (out_channels != channels) {
             tensors[prefix + "skip_connection.weight"] = skip_w;
-            tensors[prefix + "skip_connection.bias"] = skip_b;
+            tensors[prefix + "skip_connection.bias"]   = skip_b;
         }
     }
"skip_connection.weight"] = skip_w; - tensors[prefix + "skip_connection.bias"] = skip_b; + tensors[prefix + "skip_connection.bias"] = skip_b; } } @@ -994,15 +994,15 @@ struct ResBlock { // in_layers // group norm 32 auto h = ggml_group_norm_32(ctx, x); - h = ggml_add(ctx, - ggml_mul(ctx, - ggml_repeat(ctx, - ggml_reshape_4d(ctx, in_layer_0_w, 1, 1, in_layer_0_w->ne[0], 1), - h), - h), - ggml_repeat(ctx, - ggml_reshape_4d(ctx, in_layer_0_b, 1, 1, in_layer_0_b->ne[0], 1), - h)); + h = ggml_add(ctx, + ggml_mul(ctx, + ggml_repeat(ctx, + ggml_reshape_4d(ctx, in_layer_0_w, 1, 1, in_layer_0_w->ne[0], 1), + h), + h), + ggml_repeat(ctx, + ggml_reshape_4d(ctx, in_layer_0_b, 1, 1, in_layer_0_b->ne[0], 1), + h)); // silu h = ggml_silu_inplace(ctx, h); // conv2d @@ -1015,10 +1015,10 @@ struct ResBlock { // emb_layers auto emb_out = ggml_silu(ctx, emb); - emb_out = ggml_mul_mat(ctx, emb_layer_1_w, emb_out); - emb_out = ggml_add(ctx, ggml_repeat(ctx, emb_layer_1_b, emb_out), emb_out); // [N, out_channels] - emb_out = ggml_reshape_4d(ctx, emb_out, 1, 1, emb_out->ne[0], emb_out->ne[1]); // [N, out_channels, 1, 1] - emb_out = ggml_repeat(ctx, emb_out, h); // [N, out_channels, h, w] + emb_out = ggml_mul_mat(ctx, emb_layer_1_w, emb_out); + emb_out = ggml_add(ctx, ggml_repeat(ctx, emb_layer_1_b, emb_out), emb_out); // [N, out_channels] + emb_out = ggml_reshape_4d(ctx, emb_out, 1, 1, emb_out->ne[0], emb_out->ne[1]); // [N, out_channels, 1, 1] + emb_out = ggml_repeat(ctx, emb_out, h); // [N, out_channels, h, w] // out_layers h = ggml_add(ctx, h, emb_out); @@ -1056,7 +1056,7 @@ struct SpatialTransformer { int in_channels; // mult * model_channels int n_head; // num_heads int d_head; // in_channels // n_heads - int depth = 1; // 1 + int depth = 1; // 1 int context_dim = 768; // hidden_size, 1024 for SD2.x // group norm @@ -1131,8 +1131,8 @@ struct SpatialTransformer { } void init_params(struct ggml_context* ctx, ggml_type wtype) { - norm_w = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, in_channels); - norm_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, in_channels); + norm_w = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, in_channels); + norm_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, in_channels); proj_in_w = ggml_new_tensor_4d(ctx, GGML_TYPE_F16, 1, 1, in_channels, in_channels); proj_in_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, in_channels); @@ -1171,43 +1171,43 @@ struct SpatialTransformer { } void map_by_name(std::map& tensors, const std::string prefix) { - tensors[prefix + "norm.weight"] = norm_w; - tensors[prefix + "norm.bias"] = norm_b; + tensors[prefix + "norm.weight"] = norm_w; + tensors[prefix + "norm.bias"] = norm_b; tensors[prefix + "proj_in.weight"] = proj_in_w; - tensors[prefix + "proj_in.bias"] = proj_in_b; + tensors[prefix + "proj_in.bias"] = proj_in_b; // transformer { - std::string transformer_prefix = prefix + "transformer_blocks.0."; + std::string transformer_prefix = prefix + "transformer_blocks.0."; tensors[transformer_prefix + "attn1.to_q.weight"] = transformer.attn1_q_w; tensors[transformer_prefix + "attn1.to_k.weight"] = transformer.attn1_k_w; tensors[transformer_prefix + "attn1.to_v.weight"] = transformer.attn1_v_w; tensors[transformer_prefix + "attn1.to_out.0.weight"] = transformer.attn1_out_w; - tensors[transformer_prefix + "attn1.to_out.0.bias"] = transformer.attn1_out_b; + tensors[transformer_prefix + "attn1.to_out.0.bias"] = transformer.attn1_out_b; tensors[transformer_prefix + "ff.net.0.proj.weight"] = transformer.ff_0_proj_w; - tensors[transformer_prefix + "ff.net.0.proj.bias"] = 
@@ -1229,12 +1229,12 @@ struct SpatialTransformer {
                           x));  // [N, in_channels, h, w]
 
         // transformer
-        const int64_t n = x->ne[3];
-        const int64_t c = x->ne[2];
-        const int64_t h = x->ne[1];
-        const int64_t w = x->ne[0];
+        const int64_t n            = x->ne[3];
+        const int64_t c            = x->ne[2];
+        const int64_t h            = x->ne[1];
+        const int64_t w            = x->ne[0];
         const int64_t max_position = context->ne[1];
-        x = ggml_cont(ctx, ggml_permute(ctx, x, 1, 2, 0, 3));  // [N, h, w, in_channels]
+        x                          = ggml_cont(ctx, ggml_permute(ctx, x, 1, 2, 0, 3));  // [N, h, w, in_channels]
 
         {
             auto r = x;
@@ -1251,30 +1251,30 @@ struct SpatialTransformer {
 
             // self-attention
             {
-                x = ggml_reshape_2d(ctx, x, c, h * w * n);  // [N * h * w, in_channels]
+                x                     = ggml_reshape_2d(ctx, x, c, h * w * n);         // [N * h * w, in_channels]
                 struct ggml_tensor* q = ggml_mul_mat(ctx, transformer.attn1_q_w, x);   // [N * h * w, in_channels]
-                q = ggml_scale_inplace(ctx, q, ggml_new_f32(ctx, 1.0f / sqrt((float)d_head)));
-                q = ggml_reshape_4d(ctx, q, d_head, n_head, h * w, n);   // [N, h * w, n_head, d_head]
-                q = ggml_cont(ctx, ggml_permute(ctx, q, 0, 2, 1, 3));    // [N, n_head, h * w, d_head]
-                q = ggml_reshape_3d(ctx, q, d_head, h * w, n_head * n);  // [N * n_head, h * w, d_head]
+                q                     = ggml_scale_inplace(ctx, q, ggml_new_f32(ctx, 1.0f / sqrt((float)d_head)));
+                q                     = ggml_reshape_4d(ctx, q, d_head, n_head, h * w, n);   // [N, h * w, n_head, d_head]
+                q                     = ggml_cont(ctx, ggml_permute(ctx, q, 0, 2, 1, 3));    // [N, n_head, h * w, d_head]
+                q                     = ggml_reshape_3d(ctx, q, d_head, h * w, n_head * n);  // [N * n_head, h * w, d_head]
 
-                struct ggml_tensor* k = ggml_mul_mat(ctx, transformer.attn1_k_w, x);  // [N * h * w, in_channels]
-                k = ggml_reshape_4d(ctx, k, d_head, n_head, h * w, n);   // [N, h * w, n_head, d_head]
-                k = ggml_cont(ctx, ggml_permute(ctx, k, 0, 2, 1, 3));    // [N, n_head, h * w, d_head]
-                k = ggml_reshape_3d(ctx, k, d_head, h * w, n_head * n);  // [N * n_head, h * w, d_head]
+                struct ggml_tensor* k = ggml_mul_mat(ctx, transformer.attn1_k_w, x);         // [N * h * w, in_channels]
+                k                     = ggml_reshape_4d(ctx, k, d_head, n_head, h * w, n);   // [N, h * w, n_head, d_head]
+                k                     = ggml_cont(ctx, ggml_permute(ctx, k, 0, 2, 1, 3));    // [N, n_head, h * w, d_head]
+                k                     = ggml_reshape_3d(ctx, k, d_head, h * w, n_head * n);  // [N * n_head, h * w, d_head]
 
-                struct ggml_tensor* v = ggml_mul_mat(ctx, transformer.attn1_v_w, x);  // [N * h * w, in_channels]
-                v = ggml_reshape_4d(ctx, v, d_head, n_head, h * w, n);   // [N, h * w, n_head, d_head]
-                v = ggml_cont(ctx, ggml_permute(ctx, v, 1, 2, 0, 3));    // [N, n_head, d_head, h * w]
-                v = ggml_reshape_3d(ctx, v, h * w, d_head, n_head * n);  // [N * n_head, d_head, h * w]
+                struct ggml_tensor* v = ggml_mul_mat(ctx, transformer.attn1_v_w, x);         // [N * h * w, in_channels]
+                v                     = ggml_reshape_4d(ctx, v, d_head, n_head, h * w, n);   // [N, h * w, n_head, d_head]
+                v                     = ggml_cont(ctx, ggml_permute(ctx, v, 1, 2, 0, 3));    // [N, n_head, d_head, h * w]
+                v                     = ggml_reshape_3d(ctx, v, h * w, d_head, n_head * n);  // [N * n_head, d_head, h * w]
 
                 struct ggml_tensor* kq = ggml_mul_mat(ctx, k, q);  // [N * n_head, h * w, h * w]
                 // kq = ggml_diag_mask_inf_inplace(ctx, kq, 0);
                 kq = ggml_soft_max_inplace(ctx, kq);
 
                 struct ggml_tensor* kqv = ggml_mul_mat(ctx, v, kq);  // [N * n_head, h * w, d_head]
-                kqv = ggml_reshape_4d(ctx, kqv, d_head, h * w, n_head, n);
-                kqv = ggml_cont(ctx, ggml_permute(ctx, kqv, 0, 2, 1, 3));  // [N, h * w, n_head, d_head]
+                kqv                     = ggml_reshape_4d(ctx, kqv, d_head, h * w, n_head, n);
+                kqv                     = ggml_cont(ctx, ggml_permute(ctx, kqv, 0, 2, 1, 3));  // [N, h * w, n_head, d_head]
 
                 // x = ggml_cpy(ctx, kqv, ggml_new_tensor_2d(ctx, GGML_TYPE_F32, d_head * n_head, h * w * n));
                 x = ggml_reshape_2d(ctx, kqv, d_head * n_head, h * w * n);
@@ -1298,24 +1298,24 @@ struct SpatialTransformer {
 
             // cross-attention
             {
-                x = ggml_reshape_2d(ctx, x, c, h * w * n);  // [N * h * w, in_channels]
-                context = ggml_reshape_2d(ctx, context, context->ne[0], context->ne[1] * context->ne[2]);  // [N * max_position, hidden_size]
-                struct ggml_tensor* q = ggml_mul_mat(ctx, transformer.attn2_q_w, x);  // [N * h * w, in_channels]
+                x                     = ggml_reshape_2d(ctx, x, c, h * w * n);                                           // [N * h * w, in_channels]
+                context               = ggml_reshape_2d(ctx, context, context->ne[0], context->ne[1] * context->ne[2]);  // [N * max_position, hidden_size]
+                struct ggml_tensor* q = ggml_mul_mat(ctx, transformer.attn2_q_w, x);                                     // [N * h * w, in_channels]
                 q = ggml_scale_inplace(ctx, q, ggml_new_f32(ctx, 1.0f / sqrt((float)d_head)));
                 q = ggml_reshape_4d(ctx, q, d_head, n_head, h * w, n);   // [N, h * w, n_head, d_head]
                 q = ggml_cont(ctx, ggml_permute(ctx, q, 0, 2, 1, 3));    // [N, n_head, h * w, d_head]
                 q = ggml_reshape_3d(ctx, q, d_head, h * w, n_head * n);  // [N * n_head, h * w, d_head]
 
-                struct ggml_tensor* k = ggml_mul_mat(ctx, transformer.attn2_k_w, context);  // [N * max_position, in_channels]
-                k = ggml_reshape_4d(ctx, k, d_head, n_head, max_position, n);  // [N, max_position, n_head, d_head]
-                k = ggml_cont(ctx, ggml_permute(ctx, k, 0, 2, 1, 3));  // [N, n_head, max_position, d_head]
-                k = ggml_reshape_3d(ctx, k, d_head, max_position, n_head * n);  // [N * n_head, max_position, d_head]
+                struct ggml_tensor* k = ggml_mul_mat(ctx, transformer.attn2_k_w, context);          // [N * max_position, in_channels]
+                k                     = ggml_reshape_4d(ctx, k, d_head, n_head, max_position, n);   // [N, max_position, n_head, d_head]
+                k                     = ggml_cont(ctx, ggml_permute(ctx, k, 0, 2, 1, 3));           // [N, n_head, max_position, d_head]
+                k                     = ggml_reshape_3d(ctx, k, d_head, max_position, n_head * n);  // [N * n_head, max_position, d_head]
 
-                struct ggml_tensor* v = ggml_mul_mat(ctx, transformer.attn2_v_w, context);  // [N * max_position, in_channels]
-                v = ggml_reshape_4d(ctx, v, d_head, n_head, max_position, n);  // [N, max_position, n_head, d_head]
-                v = ggml_cont(ctx, ggml_permute(ctx, v, 1, 2, 0, 3));  // [N, n_head, d_head, max_position]
-                v = ggml_reshape_3d(ctx, v, max_position, d_head, n_head * n);  // [N * n_head, d_head, max_position]
+                struct ggml_tensor* v = ggml_mul_mat(ctx, transformer.attn2_v_w, context);          // [N * max_position, in_channels]
+                v                     = ggml_reshape_4d(ctx, v, d_head, n_head, max_position, n);   // [N, max_position, n_head, d_head]
+                v                     = ggml_cont(ctx, ggml_permute(ctx, v, 1, 2, 0, 3));           // [N, n_head, d_head, max_position]
+                v                     = ggml_reshape_3d(ctx, v, max_position, d_head, n_head * n);  // [N * n_head, d_head, max_position]
 
                 struct ggml_tensor* kq = ggml_mul_mat(ctx, k, q);  // [N * n_head, h * w, max_position]
                 // kq = ggml_diag_mask_inf_inplace(ctx, kq, 0);
@@ -1350,16 +1350,16 @@ struct SpatialTransformer {
             // ff
             {
                 // GEGLU
-                auto x_w = ggml_view_2d(ctx,
-                                        transformer.ff_0_proj_w,
-                                        transformer.ff_0_proj_w->ne[0],
-                                        transformer.ff_0_proj_w->ne[1] / 2,
-                                        transformer.ff_0_proj_w->nb[1],
-                                        0);  // [in_channels * 4, in_channels]
-                auto x_b = ggml_view_1d(ctx,
-                                        transformer.ff_0_proj_b,
-                                        transformer.ff_0_proj_b->ne[0] / 2,
-                                        0);  // [in_channels * 4, in_channels]
+                auto x_w    = ggml_view_2d(ctx,
+                                           transformer.ff_0_proj_w,
+                                           transformer.ff_0_proj_w->ne[0],
+                                           transformer.ff_0_proj_w->ne[1] / 2,
+                                           transformer.ff_0_proj_w->nb[1],
+                                           0);  // [in_channels * 4, in_channels]
+                auto x_b    = ggml_view_1d(ctx,
+                                           transformer.ff_0_proj_b,
+                                           transformer.ff_0_proj_b->ne[0] / 2,
+                                           0);  // [in_channels * 4, in_channels]
                 auto gate_w = ggml_view_2d(ctx,
                                            transformer.ff_0_proj_w,
                                            transformer.ff_0_proj_w->ne[0],
@@ -1370,12 +1370,12 @@ struct SpatialTransformer {
                                            transformer.ff_0_proj_b,
                                            transformer.ff_0_proj_b->ne[0] / 2,
                                            transformer.ff_0_proj_b->nb[0] * transformer.ff_0_proj_b->ne[0] / 2);  // [in_channels * 4, ]
-                x = ggml_reshape_2d(ctx, x, c, w * h * n);
-                auto x_in = x;
-                x = ggml_mul_mat(ctx, x_w, x_in);  // [N * h * w, in_channels * 4]
-                x = ggml_add(ctx, ggml_repeat(ctx, x_b, x), x);
-                auto gate = ggml_mul_mat(ctx, gate_w, x_in);  // [N * h * w, in_channels * 4]
-                gate = ggml_add(ctx, ggml_repeat(ctx, gate_b, gate), gate);
+                x         = ggml_reshape_2d(ctx, x, c, w * h * n);
+                auto x_in = x;
+                x         = ggml_mul_mat(ctx, x_w, x_in);  // [N * h * w, in_channels * 4]
+                x         = ggml_add(ctx, ggml_repeat(ctx, x_b, x), x);
+                auto gate = ggml_mul_mat(ctx, gate_w, x_in);  // [N * h * w, in_channels * 4]
+                gate      = ggml_add(ctx, ggml_repeat(ctx, gate_b, gate), gate);
 
                 gate = ggml_gelu_inplace(ctx, gate);
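The two views above exist because the feed-forward is a GEGLU: `ff_0_proj` computes both halves in a single matmul, then one half gates the other through a GELU, GEGLU(x) = (xW + b) * GELU(xV + c). A scalar-level sketch of the idea, independent of ggml:

```cpp
// GEGLU at the level of a single feature pair: value * gelu(gate).
#include <cmath>

float gelu(float x) {
    // tanh approximation of GELU, the variant ggml commonly uses
    return 0.5f * x * (1.0f + std::tanh(0.7978845608f * (x + 0.044715f * x * x * x)));
}

float geglu(float value, float gate) {
    return value * gelu(gate);
}
```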
@@ -1431,10 +1431,10 @@ struct DownSample {
     void map_by_name(std::map<std::string, struct ggml_tensor*>& tensors, const std::string prefix) {
         if (vae_downsample) {
             tensors[prefix + "conv.weight"] = op_w;
-            tensors[prefix + "conv.bias"] = op_b;
+            tensors[prefix + "conv.bias"]   = op_b;
         } else {
             tensors[prefix + "op.weight"] = op_w;
-            tensors[prefix + "op.bias"] = op_b;
+            tensors[prefix + "op.bias"]   = op_b;
         }
     }
 
@@ -1513,7 +1513,7 @@ struct UpSample {
     void map_by_name(std::map<std::string, struct ggml_tensor*>& tensors, const std::string prefix) {
         tensors[prefix + "conv.weight"] = conv_w;
-        tensors[prefix + "conv.bias"] = conv_b;
+        tensors[prefix + "conv.bias"]   = conv_b;
     }
 
     struct ggml_tensor* forward(struct ggml_context* ctx, struct ggml_tensor* x) {
@@ -1533,16 +1533,16 @@ struct UpSample {
 // ldm.modules.diffusionmodules.openaimodel.UNetModel
 struct UNetModel {
     // network hparams
-    int in_channels = 4;
-    int model_channels = 320;
-    int out_channels = 4;
-    int num_res_blocks = 2;
+    int in_channels              = 4;
+    int model_channels           = 320;
+    int out_channels             = 4;
+    int num_res_blocks           = 2;
     int attention_resolutions[3] = {4, 2, 1};
-    int channel_mult[4] = {1, 2, 4, 4};
-    int time_embed_dim = 1280;  // model_channels*4
-    int num_heads = 8;
-    int num_head_channels = -1;  // channels // num_heads
-    int context_dim = 768;  // 1024 for SD2.x
+    int channel_mult[4]          = {1, 2, 4, 4};
+    int time_embed_dim           = 1280;  // model_channels*4
+    int num_heads                = 8;
+    int num_head_channels        = -1;    // channels // num_heads
+    int context_dim              = 768;   // 1024 for SD2.x
 
     // network params
     struct ggml_tensor* time_embed_0_w;  // [time_embed_dim, model_channels]
@@ -1579,9 +1579,9 @@ struct UNetModel {
     UNetModel(ModelType model_type = SD1) {
         if (model_type == SD2) {
-            context_dim = 1024;
+            context_dim       = 1024;
             num_head_channels = 64;
-            num_heads = -1;
+            num_heads         = -1;
         }
 
         // set up hparams of blocks
@@ -1595,7 +1595,7 @@ struct UNetModel {
         for (int i = 0; i < len_mults; i++) {
             int mult = channel_mult[i];
             for (int j = 0; j < num_res_blocks; j++) {
-                input_res_blocks[i][j].channels = ch;
+                input_res_blocks[i][j].channels     = ch;
                 input_res_blocks[i][j].emb_channels = time_embed_dim;
                 input_res_blocks[i][j].out_channels = mult * model_channels;
@@ -1609,14 +1609,14 @@ struct UNetModel {
                         n_head = ch / d_head;
                     }
                     input_transformers[i][j].in_channels = ch;
-                    input_transformers[i][j].n_head = n_head;
-                    input_transformers[i][j].d_head = d_head;
+                    input_transformers[i][j].n_head      = n_head;
+                    input_transformers[i][j].d_head      = d_head;
                     input_transformers[i][j].context_dim = context_dim;
                 }
                 input_block_chans.push_back(ch);
             }
             if (i != len_mults - 1) {
-                input_down_samples[i].channels = ch;
+                input_down_samples[i].channels     = ch;
                 input_down_samples[i].out_channels = ch;
                 input_block_chans.push_back(ch);
@@ -1625,7 +1625,7 @@ struct UNetModel {
         }
 
         // middle blocks
-        middle_block_0.channels = ch;
+        middle_block_0.channels     = ch;
         middle_block_0.emb_channels = time_embed_dim;
         middle_block_0.out_channels = ch;
@@ -1636,11 +1636,11 @@ struct UNetModel {
             n_head = ch / d_head;
         }
         middle_block_1.in_channels = ch;
-        middle_block_1.n_head = n_head;
-        middle_block_1.d_head = d_head;
+        middle_block_1.n_head      = n_head;
+        middle_block_1.d_head      = d_head;
         middle_block_1.context_dim = context_dim;
 
-        middle_block_2.channels = ch;
+        middle_block_2.channels     = ch;
         middle_block_2.emb_channels = time_embed_dim;
         middle_block_2.out_channels = ch;
@@ -1651,7 +1651,7 @@ struct UNetModel {
                 int ich = input_block_chans.back();
                 input_block_chans.pop_back();
 
-                output_res_blocks[i][j].channels = ch + ich;
+                output_res_blocks[i][j].channels     = ch + ich;
                 output_res_blocks[i][j].emb_channels = time_embed_dim;
                 output_res_blocks[i][j].out_channels = mult * model_channels;
@@ -1665,13 +1665,13 @@ struct UNetModel {
                         n_head = ch / d_head;
                     }
                     output_transformers[i][j].in_channels = ch;
-                    output_transformers[i][j].n_head = n_head;
-                    output_transformers[i][j].d_head = d_head;
+                    output_transformers[i][j].n_head      = n_head;
+                    output_transformers[i][j].d_head      = d_head;
                     output_transformers[i][j].context_dim = context_dim;
                 }
 
                 if (i > 0 && j == num_res_blocks) {
-                    output_up_samples[i - 1].channels = ch;
+                    output_up_samples[i - 1].channels     = ch;
                     output_up_samples[i - 1].out_channels = ch;
 
                     ds /= 2;
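With `model_channels = 320` and `channel_mult = {1, 2, 4, 4}`, the constructor loops above produce the standard SD 1.x topology: levels at 320/640/1280/1280 channels, `ds` doubling at each downsample, and spatial transformers only where `ds` is one of `attention_resolutions` ({4, 2, 1}). A standalone sketch of just that bookkeeping:

```cpp
// Standalone sketch of the channel bookkeeping in UNetModel's constructor.
#include <cstdio>

int main() {
    const int model_channels  = 320;
    const int channel_mult[4] = {1, 2, 4, 4};
    int ds = 1;
    for (int i = 0; i < 4; i++) {
        int ch = channel_mult[i] * model_channels;  // res block output channels
        bool has_attention = (ds == 1 || ds == 2 || ds == 4);  // attention_resolutions
        printf("level %d: ch = %4d, ds = %d, attention = %s\n",
               i, ch, ds, has_attention ? "yes" : "no");
        if (i != 3) {
            ds *= 2;  // DownSample between levels
        }
    }
}
// Prints 320/640/1280/1280 with attention at ds = 1, 2, 4 but not at 8.
```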
@@ -1693,7 +1693,7 @@ struct UNetModel {
         mem_size += 6 * ggml_tensor_overhead();  // object overhead
 
         // input_blocks
-        int ds = 1;
+        int ds        = 1;
         int len_mults = sizeof(channel_mult) / sizeof(int);
         for (int i = 0; i < len_mults; i++) {
             for (int j = 0; j < num_res_blocks; j++) {
@@ -1750,8 +1750,8 @@ struct UNetModel {
         // input_blocks
         input_block_0_w = ggml_new_tensor_4d(ctx, GGML_TYPE_F16, 3, 3, in_channels, model_channels);
         input_block_0_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, model_channels);
-        int ds = 1;
-        int len_mults = sizeof(channel_mult) / sizeof(int);
+        int ds          = 1;
+        int len_mults   = sizeof(channel_mult) / sizeof(int);
         for (int i = 0; i < len_mults; i++) {
             for (int j = 0; j < num_res_blocks; j++) {
                 input_res_blocks[i][j].init_params(ctx, wtype);
@@ -1797,18 +1797,18 @@ struct UNetModel {
     void map_by_name(std::map<std::string, struct ggml_tensor*>& tensors, const std::string prefix) {
         tensors[prefix + "time_embed.0.weight"] = time_embed_0_w;
-        tensors[prefix + "time_embed.0.bias"] = time_embed_0_b;
+        tensors[prefix + "time_embed.0.bias"]   = time_embed_0_b;
         tensors[prefix + "time_embed.2.weight"] = time_embed_2_w;
-        tensors[prefix + "time_embed.2.bias"] = time_embed_2_b;
+        tensors[prefix + "time_embed.2.bias"]   = time_embed_2_b;
 
         // input_blocks
         tensors[prefix + "input_blocks.0.0.weight"] = input_block_0_w;
-        tensors[prefix + "input_blocks.0.0.bias"] = input_block_0_b;
+        tensors[prefix + "input_blocks.0.0.bias"]   = input_block_0_b;
 
-        int len_mults = sizeof(channel_mult) / sizeof(int);
+        int len_mults       = sizeof(channel_mult) / sizeof(int);
         int input_block_idx = 0;
-        int ds = 1;
+        int ds              = 1;
         for (int i = 0; i < len_mults; i++) {
             for (int j = 0; j < num_res_blocks; j++) {
                 input_block_idx += 1;
@@ -1853,9 +1853,9 @@ struct UNetModel {
 
         // out
         tensors[prefix + "out.0.weight"] = out_0_w;
-        tensors[prefix + "out.0.bias"] = out_0_b;
+        tensors[prefix + "out.0.bias"]   = out_0_b;
         tensors[prefix + "out.2.weight"] = out_2_w;
-        tensors[prefix + "out.2.bias"] = out_2_b;
+        tensors[prefix + "out.2.bias"]   = out_2_b;
     }
 
     struct ggml_tensor* forward(struct ggml_context* ctx,
@@ -1873,24 +1873,24 @@ struct UNetModel {
 
         // time_embed
         auto emb = ggml_mul_mat(ctx, time_embed_0_w, t_emb);
-        emb = ggml_add(ctx, ggml_repeat(ctx, time_embed_0_b, emb), emb);
-        emb = ggml_silu_inplace(ctx, emb);
-        emb = ggml_mul_mat(ctx, time_embed_2_w, emb);
-        emb = ggml_add(ctx, ggml_repeat(ctx, time_embed_2_b, emb), emb);  // [N, time_embed_dim]
+        emb      = ggml_add(ctx, ggml_repeat(ctx, time_embed_0_b, emb), emb);
+        emb      = ggml_silu_inplace(ctx, emb);
+        emb      = ggml_mul_mat(ctx, time_embed_2_w, emb);
+        emb      = ggml_add(ctx, ggml_repeat(ctx, time_embed_2_b, emb), emb);  // [N, time_embed_dim]
 
         // input_blocks
         std::vector<struct ggml_tensor*> hs;
 
         // input block 0
         auto h = ggml_conv_2d(ctx, input_block_0_w, x, 1, 1, 1, 1, 1, 1);  // [N, model_channels, h, w]
-        h = ggml_add(ctx,
-                     h,
-                     ggml_repeat(ctx,
-                                 ggml_reshape_4d(ctx, input_block_0_b, 1, 1, input_block_0_b->ne[0], 1),
-                                 h));  // [N, model_channels, h, w]
+        h      = ggml_add(ctx,
+                          h,
+                          ggml_repeat(ctx,
+                                      ggml_reshape_4d(ctx, input_block_0_b, 1, 1, input_block_0_b->ne[0], 1),
+                                      h));  // [N, model_channels, h, w]
         hs.push_back(h);
 
         // input block 1-11
         int len_mults = sizeof(channel_mult) / sizeof(int);
-        int ds = 1;
+        int ds        = 1;
         for (int i = 0; i < len_mults; i++) {
             int mult = channel_mult[i];
             for (int j = 0; j < num_res_blocks; j++) {
@@ -2021,18 +2021,18 @@ struct ResnetBlock {
     void map_by_name(std::map<std::string, struct ggml_tensor*>& tensors, const std::string prefix) {
         tensors[prefix + "norm1.weight"] = norm1_w;
-        tensors[prefix + "norm1.bias"] = norm1_b;
+        tensors[prefix + "norm1.bias"]   = norm1_b;
         tensors[prefix + "conv1.weight"] = conv1_w;
-        tensors[prefix + "conv1.bias"] = conv1_b;
+        tensors[prefix + "conv1.bias"]   = conv1_b;
         tensors[prefix + "norm2.weight"] = norm2_w;
-        tensors[prefix + "norm2.bias"] = norm2_b;
+        tensors[prefix + "norm2.bias"]   = norm2_b;
         tensors[prefix + "conv2.weight"] = conv2_w;
-        tensors[prefix + "conv2.bias"] = conv2_b;
+        tensors[prefix + "conv2.bias"]   = conv2_b;
 
         if (out_channels != in_channels) {
             tensors[prefix + "nin_shortcut.weight"] = nin_shortcut_w;
-            tensors[prefix + "nin_shortcut.bias"] = nin_shortcut_b;
+            tensors[prefix + "nin_shortcut.bias"]   = nin_shortcut_b;
         }
     }
@@ -2041,16 +2041,16 @@ struct ResnetBlock {
         // group norm 32
         auto h = ggml_group_norm_32(ctx, z);
-        h = ggml_mul(ctx,
-                     ggml_repeat(ctx,
-                                 ggml_reshape_4d(ctx, norm1_w, 1, 1, norm1_w->ne[0], 1),
-                                 h),
-                     h);
-        h = ggml_add(ctx,
-                     h,
-                     ggml_repeat(ctx,
-                                 ggml_reshape_4d(ctx, norm1_b, 1, 1, norm1_b->ne[0], 1),
-                                 h));
+        h      = ggml_mul(ctx,
+                          ggml_repeat(ctx,
+                                      ggml_reshape_4d(ctx, norm1_w, 1, 1, norm1_w->ne[0], 1),
+                                      h),
+                          h);
+        h      = ggml_add(ctx,
+                          h,
+                          ggml_repeat(ctx,
+                                      ggml_reshape_4d(ctx, norm1_b, 1, 1, norm1_b->ne[0], 1),
+                                      h));
         // silu
         h = ggml_silu_inplace(ctx, h);
         // conv2d
@@ -2121,28 +2121,28 @@ struct AttnBlock {
     void init_params(struct ggml_context* ctx, ggml_type wtype) {
         norm_w = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, in_channels);
         norm_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, in_channels);
-        q_w = ggml_new_tensor_4d(ctx, GGML_TYPE_F16, 1, 1, in_channels, in_channels);
-        q_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, in_channels);
-        k_w = ggml_new_tensor_4d(ctx, GGML_TYPE_F16, 1, 1, in_channels, in_channels);
-        k_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, in_channels);
-        v_w = ggml_new_tensor_4d(ctx, GGML_TYPE_F16, 1, 1, in_channels, in_channels);
-        v_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, in_channels);
+        q_w    = ggml_new_tensor_4d(ctx, GGML_TYPE_F16, 1, 1, in_channels, in_channels);
+        q_b    = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, in_channels);
+        k_w    = ggml_new_tensor_4d(ctx, GGML_TYPE_F16, 1, 1, in_channels, in_channels);
+        k_b    = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, in_channels);
+        v_w    = ggml_new_tensor_4d(ctx, GGML_TYPE_F16, 1, 1, in_channels, in_channels);
+        v_b    = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, in_channels);
 
         proj_out_w = ggml_new_tensor_4d(ctx, GGML_TYPE_F16, 1, 1, in_channels, in_channels);
         proj_out_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, in_channels);
     }
 
     void map_by_name(std::map<std::string, struct ggml_tensor*>& tensors, const std::string prefix) {
-        tensors[prefix + "norm.weight"] = norm_w;
-        tensors[prefix + "norm.bias"] = norm_b;
-        tensors[prefix + "q.weight"] = q_w;
-        tensors[prefix + "q.bias"] = q_b;
-        tensors[prefix + "k.weight"] = k_w;
-        tensors[prefix + "k.bias"] = k_b;
-        tensors[prefix + "v.weight"] = v_w;
-        tensors[prefix + "v.bias"] = v_b;
+        tensors[prefix + "norm.weight"]     = norm_w;
+        tensors[prefix + "norm.bias"]       = norm_b;
+        tensors[prefix + "q.weight"]        = q_w;
+        tensors[prefix + "q.bias"]          = q_b;
+        tensors[prefix + "k.weight"]        = k_w;
+        tensors[prefix + "k.bias"]          = k_b;
+        tensors[prefix + "v.weight"]        = v_w;
+        tensors[prefix + "v.bias"]          = v_b;
         tensors[prefix + "proj_out.weight"] = proj_out_w;
-        tensors[prefix + "proj_out.bias"] = proj_out_b;
+        tensors[prefix + "proj_out.bias"]   = proj_out_b;
     }
 
     struct ggml_tensor* forward(struct ggml_context* ctx, struct ggml_tensor* x) {
@@ -2150,9 +2150,9 @@ struct AttnBlock {
         // group norm 32
         auto h_ = ggml_group_norm_32(ctx, x);
-        h_ = ggml_add(ctx,
-                      ggml_mul(ctx, ggml_repeat(ctx, ggml_reshape_4d(ctx, norm_w, 1, 1, norm_w->ne[0], 1), h_), h_),
-                      ggml_repeat(ctx, ggml_reshape_4d(ctx, norm_b, 1, 1, norm_b->ne[0], 1), h_));
+        h_      = ggml_add(ctx,
+                           ggml_mul(ctx, ggml_repeat(ctx, ggml_reshape_4d(ctx, norm_w, 1, 1, norm_w->ne[0], 1), h_), h_),
+                           ggml_repeat(ctx, ggml_reshape_4d(ctx, norm_b, 1, 1, norm_b->ne[0], 1), h_));
 
         const int64_t n = h_->ne[3];
         const int64_t c = h_->ne[2];
@@ -2160,27 +2160,27 @@ struct AttnBlock {
         const int64_t w = h_->ne[0];
         // q
         auto q = ggml_conv_2d(ctx, q_w, h_, 1, 1, 0, 0, 1, 1);
-        q = ggml_add(ctx,
-                     q,
-                     ggml_repeat(ctx,
-                                 ggml_reshape_4d(ctx, q_b, 1, 1, q_b->ne[0], 1),
-                                 q));  // [N, in_channels, h, w]
+        q      = ggml_add(ctx,
+                          q,
+                          ggml_repeat(ctx,
+                                      ggml_reshape_4d(ctx, q_b, 1, 1, q_b->ne[0], 1),
+                                      q));  // [N, in_channels, h, w]
         // k
         auto k = ggml_conv_2d(ctx, k_w, h_, 1, 1, 0, 0, 1, 1);
-        k = ggml_add(ctx,
-                     k,
-                     ggml_repeat(ctx,
-                                 ggml_reshape_4d(ctx, k_b, 1, 1, k_b->ne[0], 1),
-                                 k));  // [N, in_channels, h, w]
+        k      = ggml_add(ctx,
+                          k,
+                          ggml_repeat(ctx,
+                                      ggml_reshape_4d(ctx, k_b, 1, 1, k_b->ne[0], 1),
+                                      k));  // [N, in_channels, h, w]
         // v
         auto v = ggml_conv_2d(ctx, v_w, h_, 1, 1, 0, 0, 1, 1);
-        v = ggml_add(ctx,
-                     v,
-                     ggml_repeat(ctx,
-                                 ggml_reshape_4d(ctx, v_b, 1, 1, v_b->ne[0], 1),
-                                 v));  // [N, in_channels, h, w]
+        v      = ggml_add(ctx,
+                          v,
+                          ggml_repeat(ctx,
+                                      ggml_reshape_4d(ctx, v_b, 1, 1, v_b->ne[0], 1),
+                                      v));  // [N, in_channels, h, w]
 
         q = ggml_cont(ctx, ggml_permute(ctx, q, 1, 2, 0, 3));  // [N, h, w, in_channels]
         q = ggml_reshape_3d(ctx, q, c, h * w, n);              // [N, h * w, in_channels]
@@ -2189,10 +2189,10 @@ struct AttnBlock {
         k = ggml_reshape_3d(ctx, k, c, h * w, n);              // [N, h * w, in_channels]
 
         auto w_ = ggml_mul_mat(ctx, k, q);  // [N, h * w, h * w]
-        w_ = ggml_scale_inplace(ctx, w_, ggml_new_f32(ctx, 1.0f / sqrt((float)c)));
-        w_ = ggml_soft_max_inplace(ctx, w_);
+        w_      = ggml_scale_inplace(ctx, w_, ggml_new_f32(ctx, 1.0f / sqrt((float)c)));
+        w_      = ggml_soft_max_inplace(ctx, w_);
 
-        v = ggml_reshape_3d(ctx, v, h * w, c, n);  // [N, in_channels, h * w]
+        v  = ggml_reshape_3d(ctx, v, h * w, c, n);                // [N, in_channels, h * w]
         h_ = ggml_mul_mat(ctx, v, w_);                            // [N, h * w, in_channels]
         h_ = ggml_cont(ctx, ggml_permute(ctx, h_, 1, 0, 2, 3));   // [N, in_channels, h * w]
         h_ = ggml_reshape_4d(ctx, h_, w, h, c, n);                // [N, in_channels, h, w]
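`AttnBlock` is the VAE's single-head self-attention: the h * w spatial positions become a sequence and are attended with softmax(K^T Q / sqrt(c)). A naive, dependency-free sketch of the same computation (the real code stays in ggml tensors):

```cpp
// Naive single-head self-attention over T positions with C channels.
// q, k, v: T x C row-major; out: T x C. Computes softmax(q k^T / sqrt(C)) v.
#include <cmath>
#include <vector>

void attention(const std::vector<float>& q, const std::vector<float>& k,
               const std::vector<float>& v, std::vector<float>& out,
               int T, int C) {
    out.assign((size_t)T * C, 0.0f);
    std::vector<float> w(T);
    for (int i = 0; i < T; i++) {
        float maxw = -1e30f, sum = 0.0f;
        for (int j = 0; j < T; j++) {
            float dot = 0.0f;
            for (int c = 0; c < C; c++) dot += q[i * C + c] * k[j * C + c];
            w[j] = dot / std::sqrt((float)C);
            if (w[j] > maxw) maxw = w[j];
        }
        for (int j = 0; j < T; j++) {  // numerically stable softmax row
            w[j] = std::exp(w[j] - maxw);
            sum += w[j];
        }
        for (int j = 0; j < T; j++) {
            for (int c = 0; c < C; c++) out[i * C + c] += (w[j] / sum) * v[j * C + c];
        }
    }
}
```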
@@ -2211,12 +2211,12 @@ struct AttnBlock {
 
 // ldm.modules.diffusionmodules.model.Encoder
 struct Encoder {
-    int embed_dim = 4;
-    int ch = 128;
-    int z_channels = 4;
-    int in_channels = 3;
+    int embed_dim      = 4;
+    int ch             = 128;
+    int z_channels     = 4;
+    int in_channels    = 3;
     int num_res_blocks = 2;
-    int ch_mult[4] = {1, 2, 4, 4};
+    int ch_mult[4]     = {1, 2, 4, 4};
 
     struct ggml_tensor* conv_in_w;  // [ch, in_channels, 3, 3]
     struct ggml_tensor* conv_in_b;  // [ch, ]
@@ -2250,28 +2250,28 @@ struct Encoder {
             }
             int block_out = ch * ch_mult[i];
             for (int j = 0; j < num_res_blocks; j++) {
-                down_blocks[i][j].in_channels = block_in;
+                down_blocks[i][j].in_channels  = block_in;
                 down_blocks[i][j].out_channels = block_out;
-                block_in = block_out;
+                block_in                       = block_out;
             }
             if (i != len_mults - 1) {
-                down_samples[i].channels = block_in;
-                down_samples[i].out_channels = block_in;
+                down_samples[i].channels       = block_in;
+                down_samples[i].out_channels   = block_in;
                 down_samples[i].vae_downsample = true;
             }
         }
 
-        mid.block_1.in_channels = block_in;
+        mid.block_1.in_channels  = block_in;
         mid.block_1.out_channels = block_in;
-        mid.attn_1.in_channels = block_in;
-        mid.block_2.in_channels = block_in;
+        mid.attn_1.in_channels   = block_in;
+        mid.block_2.in_channels  = block_in;
         mid.block_2.out_channels = block_in;
     }
 
     size_t compute_params_mem_size(ggml_type wtype) {
         double mem_size = 0;
-        int len_mults = sizeof(ch_mult) / sizeof(int);
-        int block_in = ch * ch_mult[len_mults - 1];
+        int len_mults   = sizeof(ch_mult) / sizeof(int);
+        int block_in    = ch * ch_mult[len_mults - 1];
 
         mem_size += ch * in_channels * 3 * 3 * ggml_type_sizef(GGML_TYPE_F16);  // conv_in_w
         mem_size += ch * ggml_type_sizef(GGML_TYPE_F32);                        // conv_in_b
@@ -2301,7 +2301,7 @@ struct Encoder {
     void init_params(struct ggml_context* ctx, ggml_type wtype) {
         int len_mults = sizeof(ch_mult) / sizeof(int);
-        int block_in = ch * ch_mult[len_mults - 1];
+        int block_in  = ch * ch_mult[len_mults - 1];
 
         conv_in_w = ggml_new_tensor_4d(ctx, GGML_TYPE_F16, 3, 3, in_channels, ch);
         conv_in_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, ch);
@@ -2328,11 +2328,11 @@ struct Encoder {
     void map_by_name(std::map<std::string, struct ggml_tensor*>& tensors, const std::string prefix) {
         tensors[prefix + "norm_out.weight"] = norm_out_w;
-        tensors[prefix + "norm_out.bias"] = norm_out_b;
-        tensors[prefix + "conv_in.weight"] = conv_in_w;
-        tensors[prefix + "conv_in.bias"] = conv_in_b;
+        tensors[prefix + "norm_out.bias"]   = norm_out_b;
+        tensors[prefix + "conv_in.weight"]  = conv_in_w;
+        tensors[prefix + "conv_in.bias"]    = conv_in_b;
         tensors[prefix + "conv_out.weight"] = conv_out_w;
-        tensors[prefix + "conv_out.bias"] = conv_out_b;
+        tensors[prefix + "conv_out.bias"]   = conv_out_b;
 
         mid.block_1.map_by_name(tensors, prefix + "mid.block_1.");
         mid.attn_1.map_by_name(tensors, prefix + "mid.attn_1.");
@@ -2353,12 +2353,12 @@ struct Encoder {
         // x: [N, in_channels, h, w]
 
         // conv_in
-        auto h = ggml_conv_2d(ctx, conv_in_w, x, 1, 1, 1, 1, 1, 1);
-        h = ggml_add(ctx,
-                     h,
-                     ggml_repeat(ctx,
-                                 ggml_reshape_4d(ctx, conv_in_b, 1, 1, conv_in_b->ne[0], 1),
-                                 h));  // [N, ch, h, w]
+        auto h = ggml_conv_2d(ctx, conv_in_w, x, 1, 1, 1, 1, 1, 1);
+        h      = ggml_add(ctx,
+                          h,
+                          ggml_repeat(ctx,
+                                      ggml_reshape_4d(ctx, conv_in_b, 1, 1, conv_in_b->ne[0], 1),
+                                      h));  // [N, ch, h, w]
         int len_mults = sizeof(ch_mult) / sizeof(int);
         for (int i = 0; i < len_mults; i++) {
             for (int j = 0; j < num_res_blocks; j++) {
@@ -2397,12 +2397,12 @@ struct Encoder {
 
 // ldm.modules.diffusionmodules.model.Decoder
 struct Decoder {
-    int embed_dim = 4;
-    int ch = 128;
-    int z_channels = 4;
-    int out_ch = 3;
+    int embed_dim      = 4;
+    int ch             = 128;
+    int z_channels     = 4;
+    int out_ch         = 3;
     int num_res_blocks = 2;
-    int ch_mult[4] = {1, 2, 4, 4};
+    int ch_mult[4]     = {1, 2, 4, 4};
 
     // block_in = ch * ch_mult[-1], 512
     struct ggml_tensor* conv_in_w;  // [block_in, z_channels, 3, 3]
@@ -2426,24 +2426,24 @@ struct Decoder {
     Decoder() {
         int len_mults = sizeof(ch_mult) / sizeof(int);
-        int block_in = ch * ch_mult[len_mults - 1];
+        int block_in  = ch * ch_mult[len_mults - 1];
 
-        mid.block_1.in_channels = block_in;
+        mid.block_1.in_channels  = block_in;
         mid.block_1.out_channels = block_in;
-        mid.attn_1.in_channels = block_in;
-        mid.block_2.in_channels = block_in;
+        mid.attn_1.in_channels   = block_in;
+        mid.block_2.in_channels  = block_in;
         mid.block_2.out_channels = block_in;
 
         for (int i = len_mults - 1; i >= 0; i--) {
-            int mult = ch_mult[i];
+            int mult      = ch_mult[i];
             int block_out = ch * mult;
             for (int j = 0; j < num_res_blocks + 1; j++) {
-                up_blocks[i][j].in_channels = block_in;
+                up_blocks[i][j].in_channels  = block_in;
                 up_blocks[i][j].out_channels = block_out;
-                block_in = block_out;
+                block_in                     = block_out;
             }
             if (i != 0) {
-                up_samples[i - 1].channels = block_in;
+                up_samples[i - 1].channels     = block_in;
                 up_samples[i - 1].out_channels = block_in;
             }
         }
     }
 
     size_t compute_params_mem_size(ggml_type wtype) {
         double mem_size = 0;
@@ -2451,8 +2451,8 @@ struct Decoder {
-        int len_mults = sizeof(ch_mult) / sizeof(int);
-        int block_in = ch * ch_mult[len_mults - 1];
+        int len_mults   = sizeof(ch_mult) / sizeof(int);
+        int block_in    = ch * ch_mult[len_mults - 1];
 
         mem_size += block_in * z_channels * 3 * 3 * ggml_type_sizef(GGML_TYPE_F16);  // conv_in_w
         mem_size += block_in * ggml_type_sizef(GGML_TYPE_F32);                       // conv_in_b
@@ -2482,7 +2482,7 @@ struct Decoder {
     void init_params(struct ggml_context* ctx, ggml_type wtype) {
         int len_mults = sizeof(ch_mult) / sizeof(int);
-        int block_in = ch * ch_mult[len_mults - 1];
+        int block_in  = ch * ch_mult[len_mults - 1];
 
         norm_out_w = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, ch * ch_mult[0]);
         norm_out_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, ch * ch_mult[0]);
@@ -2509,11 +2509,11 @@ struct Decoder {
     void map_by_name(std::map<std::string, struct ggml_tensor*>& tensors, const std::string prefix) {
         tensors[prefix + "norm_out.weight"] = norm_out_w;
-        tensors[prefix + "norm_out.bias"] = norm_out_b;
-        tensors[prefix + "conv_in.weight"] = conv_in_w;
-        tensors[prefix + "conv_in.bias"] = conv_in_b;
+        tensors[prefix + "norm_out.bias"]   = norm_out_b;
+        tensors[prefix + "conv_in.weight"]  = conv_in_w;
+        tensors[prefix + "conv_in.bias"]    = conv_in_b;
         tensors[prefix + "conv_out.weight"] = conv_out_w;
-        tensors[prefix + "conv_out.bias"] = conv_out_b;
+        tensors[prefix + "conv_out.bias"]   = conv_out_b;
 
         mid.block_1.map_by_name(tensors, prefix + "mid.block_1.");
         mid.attn_1.map_by_name(tensors, prefix + "mid.attn_1.");
@@ -2535,11 +2535,11 @@ struct Decoder {
 
         // conv_in
         auto h = ggml_conv_2d(ctx, conv_in_w, z, 1, 1, 1, 1, 1, 1);
-        h = ggml_add(ctx,
-                     h,
-                     ggml_repeat(ctx,
-                                 ggml_reshape_4d(ctx, conv_in_b, 1, 1, conv_in_b->ne[0], 1),
-                                 h));  // [N, block_in, h, w]
+        h      = ggml_add(ctx,
+                          h,
+                          ggml_repeat(ctx,
+                                      ggml_reshape_4d(ctx, conv_in_b, 1, 1, conv_in_b->ne[0], 1),
+                                      h));  // [N, block_in, h, w]
 
         h = mid.block_1.forward(ctx, h);
         h = mid.attn_1.forward(ctx, h);
@@ -2580,15 +2580,15 @@ struct Decoder {
 
 // ldm.models.autoencoder.AutoencoderKL
 struct AutoEncoderKL {
     bool decode_only = true;
-    int embed_dim = 4;
+    int embed_dim    = 4;
     struct {
-        int z_channels = 4;
-        int resolution = 256;
-        int in_channels = 3;
-        int out_ch = 3;
-        int ch = 128;
-        int ch_mult[4] = {1, 2, 4, 4};
+        int z_channels     = 4;
+        int resolution     = 256;
+        int in_channels    = 3;
+        int out_ch         = 3;
+        int ch             = 128;
+        int ch_mult[4]     = {1, 2, 4, 4};
         int num_res_blocks = 2;
     } dd_config;
@@ -2606,14 +2606,14 @@ struct AutoEncoderKL {
         assert(sizeof(dd_config.ch_mult) == sizeof(encoder.ch_mult));
         assert(sizeof(dd_config.ch_mult) == sizeof(decoder.ch_mult));
 
-        encoder.embed_dim = embed_dim;
-        decoder.embed_dim = embed_dim;
-        encoder.ch = dd_config.ch;
-        decoder.ch = dd_config.ch;
-        encoder.z_channels = dd_config.z_channels;
-        decoder.z_channels = dd_config.z_channels;
-        encoder.in_channels = dd_config.in_channels;
-        decoder.out_ch = dd_config.out_ch;
+        encoder.embed_dim      = embed_dim;
+        decoder.embed_dim      = embed_dim;
+        encoder.ch             = dd_config.ch;
+        decoder.ch             = dd_config.ch;
+        encoder.z_channels     = dd_config.z_channels;
+        decoder.z_channels     = dd_config.z_channels;
+        encoder.in_channels    = dd_config.in_channels;
+        decoder.out_ch         = dd_config.out_ch;
         encoder.num_res_blocks = dd_config.num_res_blocks;
 
         int len_mults = sizeof(dd_config.ch_mult) / sizeof(int);
@@ -2654,12 +2654,12 @@ struct AutoEncoderKL {
     void map_by_name(std::map<std::string, struct ggml_tensor*>& tensors, const std::string prefix) {
         if (!decode_only) {
             tensors[prefix + "quant_conv.weight"] = quant_conv_w;
-            tensors[prefix + "quant_conv.bias"] = quant_conv_b;
+            tensors[prefix + "quant_conv.bias"]   = quant_conv_b;
             encoder.map_by_name(tensors, prefix + "encoder.");
         }
 
         tensors[prefix + "post_quant_conv.weight"] = post_quant_conv_w;
-        tensors[prefix + "post_quant_conv.bias"] = post_quant_conv_b;
+        tensors[prefix + "post_quant_conv.bias"]   = post_quant_conv_b;
         decoder.map_by_name(tensors, prefix + "decoder.");
     }
@@ -2668,12 +2668,12 @@ struct AutoEncoderKL {
 
         // post_quant_conv
         auto h = ggml_conv_2d(ctx, post_quant_conv_w, z, 1, 1, 0, 0, 1, 1);
-        h = ggml_add(ctx,
-                     h,
-                     ggml_repeat(ctx,
-                                 ggml_reshape_4d(ctx, post_quant_conv_b, 1, 1, post_quant_conv_b->ne[0], 1),
-                                 h));  // [N, z_channels, h, w]
-        h = decoder.forward(ctx, h);
+        h      = ggml_add(ctx,
+                          h,
+                          ggml_repeat(ctx,
+                                      ggml_reshape_4d(ctx, post_quant_conv_b, 1, 1, post_quant_conv_b->ne[0], 1),
+                                      h));  // [N, z_channels, h, w]
+        h      = decoder.forward(ctx, h);
         return h;
     }
@@ -2716,22 +2716,22 @@ struct SigmaSchedule {
                 low_idx++;
             }
         }
-        low_idx = std::min(std::max(low_idx - 1, 0), TIMESTEPS - 2);
+        low_idx      = std::min(std::max(low_idx - 1, 0), TIMESTEPS - 2);
         int high_idx = low_idx + 1;
 
-        float low = log_sigmas[low_idx];
+        float low  = log_sigmas[low_idx];
         float high = log_sigmas[high_idx];
-        float w = (low - log_sigma) / (low - high);
-        w = std::max(0.f, std::min(1.f, w));
-        float t = (1.0f - w) * low_idx + w * high_idx;
+        float w    = (low - log_sigma) / (low - high);
+        w          = std::max(0.f, std::min(1.f, w));
+        float t    = (1.0f - w) * low_idx + w * high_idx;
 
         return t;
     }
 
     float t_to_sigma(float t) {
-        int low_idx = static_cast<int>(std::floor(t));
-        int high_idx = static_cast<int>(std::ceil(t));
-        float w = t - static_cast<float>(low_idx);
+        int low_idx     = static_cast<int>(std::floor(t));
+        int high_idx    = static_cast<int>(std::ceil(t));
+        float w         = t - static_cast<float>(low_idx);
         float log_sigma = (1.0f - w) * log_sigmas[low_idx] + w * log_sigmas[high_idx];
         return std::exp(log_sigma);
     }
@@ -2767,7 +2767,7 @@ struct KarrasSchedule : SigmaSchedule {
         // but does anybody ever bother to touch them?
         float sigma_min = 0.1f;
         float sigma_max = 10.f;
-        float rho = 7.f;
+        float rho       = 7.f;
 
         std::vector<float> result(n + 1);
@@ -2783,7 +2783,7 @@ struct KarrasSchedule : SigmaSchedule {
 };
 
 struct Denoiser {
-    std::shared_ptr<SigmaSchedule> schedule = std::make_shared<DiscreteSchedule>();
+    std::shared_ptr<SigmaSchedule> schedule              = std::make_shared<DiscreteSchedule>();
     virtual std::vector<float> get_scalings(float sigma) = 0;
 };
@@ -2792,7 +2792,7 @@ struct CompVisDenoiser : public Denoiser {
     std::vector<float> get_scalings(float sigma) {
         float c_out = -sigma;
-        float c_in = 1.0f / std::sqrt(sigma * sigma + sigma_data * sigma_data);
+        float c_in  = 1.0f / std::sqrt(sigma * sigma + sigma_data * sigma_data);
         return {c_out, c_in};
     }
 };
@@ -2802,8 +2802,8 @@ struct CompVisVDenoiser : public Denoiser {
     std::vector<float> get_scalings(float sigma) {
         float c_skip = sigma_data * sigma_data / (sigma * sigma + sigma_data * sigma_data);
-        float c_out = -sigma * sigma_data / std::sqrt(sigma * sigma + sigma_data * sigma_data);
-        float c_in = 1.0f / std::sqrt(sigma * sigma + sigma_data * sigma_data);
+        float c_out  = -sigma * sigma_data / std::sqrt(sigma * sigma + sigma_data * sigma_data);
+        float c_in   = 1.0f / std::sqrt(sigma * sigma + sigma_data * sigma_data);
         return {c_skip, c_out, c_in};
     }
 };
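These hunks follow the k-diffusion conventions: a Karras schedule spaces sigmas as sigma_i = (sigma_max^(1/rho) + i/(n-1) * (sigma_min^(1/rho) - sigma_max^(1/rho)))^rho with rho = 7, and the denoiser scalings (c_skip, c_out, c_in) normalize the network input and output against sigma_data. A sketch of the sigma ramp under the constants shown above (the full `KarrasSchedule` body lies outside this diff):

```cpp
// Karras et al. (2022) sigma schedule sketch: rho-spaced ramp from
// sigma_max down to sigma_min, matching the constants in KarrasSchedule.
#include <cmath>
#include <vector>

std::vector<float> karras_sigmas(int n, float sigma_min = 0.1f,
                                 float sigma_max = 10.f, float rho = 7.f) {
    std::vector<float> sigmas(n);
    float min_inv_rho = std::pow(sigma_min, 1.0f / rho);
    float max_inv_rho = std::pow(sigma_max, 1.0f / rho);
    for (int i = 0; i < n; i++) {
        float t   = (n > 1) ? (float)i / (n - 1) : 0.0f;
        sigmas[i] = std::pow(max_inv_rho + t * (min_inv_rho - max_inv_rho), rho);
    }
    return sigmas;  // callers typically append a final 0.0f, hence result(n + 1) above
}
```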
LOG_DEBUG("vae params ctx size = % 6.2f MB", ctx_size / (1024.0 * 1024.0)); struct ggml_init_params params; - params.mem_size = static_cast(ctx_size); + params.mem_size = static_cast(ctx_size); params.mem_buffer = NULL; - params.no_alloc = false; - params.dynamic = false; + params.no_alloc = false; + params.dynamic = false; vae_params_ctx = ggml_init(params); if (!vae_params_ctx) { @@ -3048,7 +3048,7 @@ class StableDiffusionGGML { // load weights float alphas_cumprod[TIMESTEPS]; { - int n_tensors = 0; + int n_tensors = 0; size_t total_size = 0; while (true) { @@ -3065,7 +3065,7 @@ class StableDiffusionGGML { } int32_t nelements = 1; - int32_t ne[4] = {1, 1, 1, 1}; + int32_t ne[4] = {1, 1, 1, 1}; for (int i = 0; i < n_dims; ++i) { file.read(reinterpret_cast(&ne[i]), sizeof(ne[i])); nelements *= ne[i]; @@ -3147,8 +3147,8 @@ class StableDiffusionGGML { } LOG_DEBUG("model size = %.2fMB", total_size / 1024.0 / 1024.0); } - max_params_mem_size = ggml_used_mem(clip_params_ctx) + ggml_used_mem(unet_params_ctx) + ggml_used_mem(vae_params_ctx); - max_mem_size = max_params_mem_size; + max_params_mem_size = ggml_used_mem(clip_params_ctx) + ggml_used_mem(unet_params_ctx) + ggml_used_mem(vae_params_ctx); + max_mem_size = max_params_mem_size; curr_params_mem_size = max_params_mem_size; LOG_INFO("total params size = %.2fMB (clip %.2fMB, unet %.2fMB, vae %.2fMB)", max_params_mem_size / 1024.0 / 1024.0, @@ -3163,10 +3163,10 @@ class StableDiffusionGGML { bool is_using_v_parameterization = false; if (model_type == SD2) { struct ggml_init_params params; - params.mem_size = static_cast(10 * 1024) * 1024; // 10M - params.mem_buffer = NULL; - params.no_alloc = false; - params.dynamic = false; + params.mem_size = static_cast(10 * 1024) * 1024; // 10M + params.mem_buffer = NULL; + params.no_alloc = false; + params.dynamic = false; struct ggml_context* ctx = ggml_init(params); if (!ctx) { LOG_ERROR("ggml_init() failed"); @@ -3205,8 +3205,8 @@ class StableDiffusionGGML { for (int i = 0; i < TIMESTEPS; i++) { denoiser->schedule->alphas_cumprod[i] = alphas_cumprod[i]; - denoiser->schedule->sigmas[i] = std::sqrt((1 - denoiser->schedule->alphas_cumprod[i]) / denoiser->schedule->alphas_cumprod[i]); - denoiser->schedule->log_sigmas[i] = std::log(denoiser->schedule->sigmas[i]); + denoiser->schedule->sigmas[i] = std::sqrt((1 - denoiser->schedule->alphas_cumprod[i]) / denoiser->schedule->alphas_cumprod[i]); + denoiser->schedule->log_sigmas[i] = std::log(denoiser->schedule->sigmas[i]); } return true; @@ -3224,10 +3224,10 @@ class StableDiffusionGGML { // calculate the amount of memory required { struct ggml_init_params params; - params.mem_size = ctx_size; + params.mem_size = ctx_size; params.mem_buffer = NULL; - params.no_alloc = true; - params.dynamic = dynamic; + params.no_alloc = true; + params.dynamic = dynamic; struct ggml_context* ctx = ggml_init(params); if (!ctx) { @@ -3236,15 +3236,15 @@ class StableDiffusionGGML { } ggml_set_dynamic(ctx, false); - struct ggml_tensor* timesteps = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 1); // [N, ] - struct ggml_tensor* t_emb = new_timestep_embedding(ctx, timesteps, diffusion_model.model_channels); // [N, model_channels] + struct ggml_tensor* timesteps = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 1); // [N, ] + struct ggml_tensor* t_emb = new_timestep_embedding(ctx, timesteps, diffusion_model.model_channels); // [N, model_channels] ggml_set_dynamic(ctx, params.dynamic); struct ggml_tensor* out = diffusion_model.forward(ctx, x_t, NULL, c, t_emb); ctx_size += ggml_used_mem(ctx) + 
@@ -3224,10 +3224,10 @@ class StableDiffusionGGML {
         // calculate the amount of memory required
         {
             struct ggml_init_params params;
-            params.mem_size = ctx_size;
+            params.mem_size   = ctx_size;
             params.mem_buffer = NULL;
-            params.no_alloc = true;
-            params.dynamic = dynamic;
+            params.no_alloc   = true;
+            params.dynamic    = dynamic;

             struct ggml_context* ctx = ggml_init(params);
             if (!ctx) {
@@ -3236,15 +3236,15 @@ class StableDiffusionGGML {
             }

             ggml_set_dynamic(ctx, false);
-            struct ggml_tensor* timesteps = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 1);  // [N, ]
-            struct ggml_tensor* t_emb = new_timestep_embedding(ctx, timesteps, diffusion_model.model_channels);  // [N, model_channels]
+            struct ggml_tensor* timesteps = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 1);                               // [N, ]
+            struct ggml_tensor* t_emb     = new_timestep_embedding(ctx, timesteps, diffusion_model.model_channels);  // [N, model_channels]
             ggml_set_dynamic(ctx, params.dynamic);

             struct ggml_tensor* out = diffusion_model.forward(ctx, x_t, NULL, c, t_emb);
             ctx_size += ggml_used_mem(ctx) + ggml_used_mem_of_data(ctx);

             struct ggml_cgraph* diffusion_graph = ggml_build_forward_ctx(ctx, out);
-            cplan = ggml_graph_plan(diffusion_graph, n_threads);
+            cplan                               = ggml_graph_plan(diffusion_graph, n_threads);

             ctx_size += cplan.work_size;

             LOG_DEBUG("diffusion context need %.2fMB static memory, with work_size needing %.2fMB",
@@ -3255,10 +3255,10 @@ class StableDiffusionGGML {
         }

         struct ggml_init_params params;
-        params.mem_size = ctx_size;
+        params.mem_size   = ctx_size;
         params.mem_buffer = NULL;
-        params.no_alloc = false;
-        params.dynamic = dynamic;
+        params.no_alloc   = false;
+        params.dynamic    = dynamic;

         struct ggml_context* ctx = ggml_init(params);
         if (!ctx) {
@@ -3267,8 +3267,8 @@ class StableDiffusionGGML {
         }

         ggml_set_dynamic(ctx, false);
-        struct ggml_tensor* timesteps = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 1);  // [N, ]
-        struct ggml_tensor* t_emb = new_timestep_embedding(ctx, timesteps, diffusion_model.model_channels);  // [N, model_channels]
+        struct ggml_tensor* timesteps = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 1);                               // [N, ]
+        struct ggml_tensor* t_emb     = new_timestep_embedding(ctx, timesteps, diffusion_model.model_channels);  // [N, model_channels]
         ggml_set_dynamic(ctx, params.dynamic);
         ggml_set_f32(timesteps, 999);
         set_timestep_embedding(timesteps, t_emb, diffusion_model.model_channels);
@@ -3277,7 +3277,7 @@ class StableDiffusionGGML {
         ggml_hold_dynamic_tensor(out);

         struct ggml_cgraph* diffusion_graph = ggml_build_forward_ctx(ctx, out);
-        cplan = ggml_graph_plan(diffusion_graph, n_threads);
+        cplan                               = ggml_graph_plan(diffusion_graph, n_threads);

         ggml_set_dynamic(ctx, false);
         struct ggml_tensor* buf = ggml_new_tensor_1d(ctx, GGML_TYPE_I8, cplan.work_size);
@@ -3291,7 +3291,7 @@ class StableDiffusionGGML {

         double result = 0.f;
         {
-            float* vec_x = (float*)x_t->data;
+            float* vec_x   = (float*)x_t->data;
             float* vec_out = (float*)out->data;

             int64_t n = ggml_nelements(out);
@@ -3369,10 +3369,10 @@ class StableDiffusionGGML {
         struct ggml_init_params params;
         size_t ctx_size = 10 * 1024 * 1024;  // 10 MB, for padding
         ctx_size += file_size;
-        params.mem_size = ctx_size;
+        params.mem_size   = ctx_size;
         params.mem_buffer = NULL;
-        params.no_alloc = false;
-        params.dynamic = false;
+        params.no_alloc   = false;
+        params.dynamic    = false;
         LOG_DEBUG("lora '%s' params ctx size = % 6.2f MB", lora_name.c_str(), ctx_size / (1024.0 * 1024.0));
         ggml_context* lora_params_ctx = ggml_init(params);
         if (!lora_params_ctx) {
@@ -3385,7 +3385,7 @@ class StableDiffusionGGML {
         int64_t t0 = ggml_time_ms();
         // load weights
         {
-            int n_tensors = 0;
+            int n_tensors     = 0;
             size_t total_size = 0;

             while (true) {
@@ -3402,7 +3402,7 @@ class StableDiffusionGGML {
                 }

                 int32_t nelements = 1;
-                int32_t ne[4] = {1, 1, 1, 1};
+                int32_t ne[4]     = {1, 1, 1, 1};
                 for (int i = 0; i < n_dims; ++i) {
                     file.read(reinterpret_cast<char*>(&ne[i]), sizeof(ne[i]));
                     nelements *= ne[i];
@@ -3416,7 +3416,7 @@ class StableDiffusionGGML {

                 // LOG_DEBUG("load lora tensor %s", name.c_str());

-                int64_t ne64[4] = {ne[0], ne[1], ne[2], ne[3]};
+                int64_t ne64[4]            = {ne[0], ne[1], ne[2], ne[3]};
                 struct ggml_tensor* tensor = ggml_new_tensor(lora_params_ctx, (ggml_type)ttype, n_dims, ne64);
                 file.read(reinterpret_cast<char*>(tensor->data), num_bytes);
@@ -3426,7 +3426,7 @@ class StableDiffusionGGML {
             }
         }
         lora_tensors[lora_name] = lora_tensor_map;
-        int64_t t1 = ggml_time_ms();
+        int64_t t1              = ggml_time_ms();

         LOG_INFO("lora '%s' params size = %.2fMB", lora_name.c_str(), ggml_used_mem(lora_params_ctx) / 1024.0 / 1024.0);
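Both weight-loading loops above read the same legacy ggml tensor layout: a few int32 header fields, one int32 per dimension, the name bytes, then the raw tensor data. A sketch of a single header read under that assumed layout (the field order is inferred from the loops, not copied from the project):

    #include <cstdint>
    #include <fstream>
    #include <string>

    bool read_tensor_header(std::ifstream& file, int32_t& n_dims,
                            int32_t& ttype, int32_t ne[4], std::string& name) {
        int32_t name_len = 0;
        file.read(reinterpret_cast<char*>(&n_dims), sizeof(n_dims));
        file.read(reinterpret_cast<char*>(&name_len), sizeof(name_len));
        file.read(reinterpret_cast<char*>(&ttype), sizeof(ttype));
        if (file.eof()) {
            return false;  // clean end of file: no more tensors
        }
        ne[0] = ne[1] = ne[2] = ne[3] = 1;  // unused dims default to 1, as above
        for (int i = 0; i < n_dims; ++i) {
            file.read(reinterpret_cast<char*>(&ne[i]), sizeof(ne[i]));
        }
        name.resize(name_len);
        file.read(&name[0], name_len);
        return static_cast<bool>(file);
    }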
@@ -3452,7 +3452,7 @@ class StableDiffusionGGML {
             return;
         }

-        size_t ctx_size = 500 * 1024 * 1024;  // 500MB
+        size_t ctx_size  = 500 * 1024 * 1024;  // 500MB
         void* mem_buffer = malloc(ctx_size);
         if (!mem_buffer) {
             if (free_params_immediately) {
@@ -3466,8 +3466,8 @@ class StableDiffusionGGML {
         std::set<std::string> applied_lora_tensors;
         for (auto& kv : tensors) {
             const std::string name = kv.first;
-            ggml_tensor* weight = kv.second;
-            std::string ending = ".weight";
+            ggml_tensor* weight    = kv.second;
+            std::string ending     = ".weight";
             if (!ends_with(name, ending)) {
                 continue;
             }
@@ -3475,12 +3475,12 @@ class StableDiffusionGGML {
             // find corresponding lora tensors
             std::string network_name = name.substr(0, name.size() - ending.size());  // remove .weight
             replace_all_chars(network_name, '.', '_');
-            std::string lora_up_name = network_name + ".lora_up.weight";
+            std::string lora_up_name   = network_name + ".lora_up.weight";
             std::string lora_down_name = network_name + ".lora_down.weight";
-            std::string alpha_name = network_name + ".alpha";
-            std::string scale_name = network_name + ".scale";
+            std::string alpha_name     = network_name + ".alpha";
+            std::string scale_name     = network_name + ".scale";

-            ggml_tensor* lora_up = NULL;
+            ggml_tensor* lora_up   = NULL;
             ggml_tensor* lora_down = NULL;

             float scale = 1.0f;
@@ -3508,10 +3508,10 @@ class StableDiffusionGGML {
             int64_t dim = lora_down->ne[lora_down->n_dims - 1];
             if (lora_tensor_map.find(scale_name) != lora_tensor_map.end()) {
                 ggml_tensor* t = lora_tensor_map[scale_name];
-                scale = ggml_get_f32_1d(t, 0);
+                scale          = ggml_get_f32_1d(t, 0);
             } else if (lora_tensor_map.find(alpha_name) != lora_tensor_map.end()) {
                 ggml_tensor* t = lora_tensor_map[alpha_name];
-                scale = ggml_get_f32_1d(t, 0) / dim;
+                scale          = ggml_get_f32_1d(t, 0) / dim;
             }

             // LOG_DEBUG("scale: %f %ld", scale, dim);
@@ -3521,10 +3521,10 @@ class StableDiffusionGGML {
             // apply
             {
                 struct ggml_init_params params;
-                params.mem_size = ctx_size;
+                params.mem_size   = ctx_size;
                 params.mem_buffer = mem_buffer;
-                params.no_alloc = false;
-                params.dynamic = false;
+                params.no_alloc   = false;
+                params.dynamic    = false;

                 struct ggml_context* ctx = ggml_init(params);
                 if (!ctx) {
@@ -3538,21 +3538,21 @@ class StableDiffusionGGML {
                 ggml_tensor* scale_factor = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 1);
                 ggml_set_f32_1d(scale_factor, 0, scale);

-                int64_t lora_up_size_0 = lora_up->ne[lora_up->n_dims - 1];
-                lora_up = ggml_reshape_2d(ctx, lora_up, ggml_nelements(lora_up) / lora_up_size_0, lora_up_size_0);
+                int64_t lora_up_size_0   = lora_up->ne[lora_up->n_dims - 1];
+                lora_up                  = ggml_reshape_2d(ctx, lora_up, ggml_nelements(lora_up) / lora_up_size_0, lora_up_size_0);
                 int64_t lora_down_size_0 = lora_down->ne[lora_down->n_dims - 1];
-                lora_down = ggml_reshape_2d(ctx, lora_down, ggml_nelements(lora_down) / lora_down_size_0, lora_down_size_0);
+                lora_down                = ggml_reshape_2d(ctx, lora_down, ggml_nelements(lora_down) / lora_down_size_0, lora_down_size_0);

                 lora_down = ggml_cont(ctx, ggml_transpose(ctx, lora_down));

                 if (lora_down->type != GGML_TYPE_F32) {
                     ggml_tensor* lora_down_f32 = ggml_new_tensor(ctx, GGML_TYPE_F32, lora_down->n_dims, lora_down->ne);
-                    lora_down = ggml_cpy_inplace(ctx, lora_down, lora_down_f32);
+                    lora_down                  = ggml_cpy_inplace(ctx, lora_down, lora_down_f32);
                 }

                 ggml_tensor* updown = ggml_mul_mat(ctx, lora_up, lora_down);
-                updown = ggml_cont(ctx, ggml_transpose(ctx, updown));
-                updown = ggml_reshape(ctx, updown, weight);
+                updown              = ggml_cont(ctx, ggml_transpose(ctx, updown));
+                updown              = ggml_reshape(ctx, updown, weight);

                 GGML_ASSERT(ggml_nelements(updown) == ggml_nelements(weight));
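The LoRA merge above folds each adapter into the base weights as W += multiplier * scale * (up · down), with scale defaulting to alpha / rank when an ".alpha" tensor is present. The same update on plain float buffers, as a hypothetical sketch (not the ggml path used above):

    #include <vector>

    // W: [out x in] row-major, up: [out x r], down: [r x in]
    void apply_lora_delta(std::vector<float>& W,
                          const std::vector<float>& up,
                          const std::vector<float>& down,
                          int out, int in, int r,
                          float alpha, float multiplier) {
        float scale = alpha / r;  // ".alpha" convention from the hunk above
        for (int o = 0; o < out; o++) {
            for (int c = 0; c < in; c++) {
                float acc = 0.0f;
                for (int k = 0; k < r; k++) {
                    acc += up[o * r + k] * down[k * in + c];
                }
                W[o * in + c] += multiplier * scale * acc;
            }
        }
    }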
@@ -3596,7 +3596,7 @@ class StableDiffusionGGML {
         std::unordered_map<std::string, float> lora_state_diff;
         for (auto& kv : lora_state) {
             const std::string& lora_name = kv.first;
-            float multiplier = kv.second;
+            float multiplier             = kv.second;

             if (curr_lora_state.find(lora_name) != curr_lora_state.end()) {
                 float curr_multiplier = curr_lora_state[lora_name];
@@ -3617,20 +3617,20 @@ class StableDiffusionGGML {
     }

     ggml_tensor* get_learned_condition(ggml_context* res_ctx, const std::string& text) {
-        auto tokens_and_weights = cond_stage_model.tokenize(text,
-                                                            cond_stage_model.text_model.max_position_embeddings,
-                                                            true);
-        std::vector<int>& tokens = tokens_and_weights.first;
+        auto tokens_and_weights     = cond_stage_model.tokenize(text,
+                                                                cond_stage_model.text_model.max_position_embeddings,
+                                                                true);
+        std::vector<int>& tokens    = tokens_and_weights.first;
         std::vector<float>& weights = tokens_and_weights.second;
         struct ggml_cplan cplan;
         size_t ctx_size = 10 * 1024 * 1024;  // 10MB
         // calculate the amount of memory required
         {
             struct ggml_init_params params;
-            params.mem_size = ctx_size;
+            params.mem_size   = ctx_size;
             params.mem_buffer = NULL;
-            params.no_alloc = true;
-            params.dynamic = dynamic;
+            params.no_alloc   = true;
+            params.dynamic    = dynamic;

             struct ggml_context* ctx = ggml_init(params);
             if (!ctx) {
@@ -3645,7 +3645,7 @@ class StableDiffusionGGML {
             struct ggml_tensor* hidden_states = cond_stage_model.text_model.forward(ctx, input_ids);

             struct ggml_cgraph* cond_graph = ggml_build_forward_ctx(ctx, hidden_states);
-            cplan = ggml_graph_plan(cond_graph, n_threads);
+            cplan                          = ggml_graph_plan(cond_graph, n_threads);
             ctx_size += cplan.work_size;

             ctx_size += ggml_used_mem(ctx) + ggml_used_mem_of_data(ctx);
@@ -3657,10 +3657,10 @@ class StableDiffusionGGML {

         // allocate the required memory and compute forward
         struct ggml_init_params params;
-        params.mem_size = ctx_size;
+        params.mem_size   = ctx_size;
         params.mem_buffer = NULL;
-        params.no_alloc = false;
-        params.dynamic = dynamic;
+        params.no_alloc   = false;
+        params.dynamic    = dynamic;

         struct ggml_context* ctx = ggml_init(params);
         if (!ctx) {
@@ -3673,7 +3673,7 @@ class StableDiffusionGGML {
         ggml_set_dynamic(ctx, params.dynamic);

         struct ggml_tensor* hidden_states = cond_stage_model.text_model.forward(ctx, input_ids);
-        struct ggml_cgraph* cond_graph = ggml_build_forward_ctx(ctx, hidden_states);
+        struct ggml_cgraph* cond_graph    = ggml_build_forward_ctx(ctx, hidden_states);
         LOG_DEBUG("building condition graph completed: %d nodes, %d leafs",
                   cond_graph->n_nodes, cond_graph->n_leafs);
@@ -3687,10 +3687,10 @@ class StableDiffusionGGML {
         ggml_tensor* result = ggml_dup_tensor(res_ctx, hidden_states);  // [N, n_token, hidden_size]

         {
-            int64_t nelements = ggml_nelements(hidden_states);
+            int64_t nelements   = ggml_nelements(hidden_states);
             float original_mean = 0.f;
-            float new_mean = 0.f;
-            float* vec = (float*)hidden_states->data;
+            float new_mean      = 0.f;
+            float* vec          = (float*)hidden_states->data;
             for (int i = 0; i < nelements; i++) {
                 original_mean += vec[i] / nelements * 1.0f;
             }
@@ -3762,10 +3762,10 @@ class StableDiffusionGGML {
         // calculate the amount of memory required
         {
             struct ggml_init_params params;
-            params.mem_size = ctx_size;
+            params.mem_size   = ctx_size;
             params.mem_buffer = NULL;
-            params.no_alloc = true;
-            params.dynamic = dynamic;
+            params.no_alloc   = true;
+            params.dynamic    = dynamic;

             struct ggml_context* ctx = ggml_init(params);
             if (!ctx) {
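get_learned_condition() above ends by rescaling the weighted hidden states so their mean matches the pre-weighting mean (original_mean vs. new_mean). Condensed into one helper, as a sketch rather than the project's code:

    #include <cstdint>

    // Scale vec so its mean returns to original_mean (assumes the new mean
    // is non-zero, as the hunk above implicitly does).
    void restore_mean(float* vec, int64_t n, float original_mean) {
        float new_mean = 0.0f;
        for (int64_t i = 0; i < n; i++) {
            new_mean += vec[i] / n;
        }
        for (int64_t i = 0; i < n; i++) {
            vec[i] *= original_mean / new_mean;
        }
    }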
@@ -3775,16 +3775,16 @@ class StableDiffusionGGML {
             ggml_set_dynamic(ctx, false);
             struct ggml_tensor* noised_input = ggml_dup_tensor(ctx, x_t);
-            struct ggml_tensor* context = ggml_dup_tensor(ctx, c);
-            struct ggml_tensor* timesteps = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 1);  // [N, ]
-            struct ggml_tensor* t_emb = new_timestep_embedding(ctx, timesteps, diffusion_model.model_channels);  // [N, model_channels]
+            struct ggml_tensor* context      = ggml_dup_tensor(ctx, c);
+            struct ggml_tensor* timesteps    = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 1);                               // [N, ]
+            struct ggml_tensor* t_emb        = new_timestep_embedding(ctx, timesteps, diffusion_model.model_channels);  // [N, model_channels]
             ggml_set_dynamic(ctx, params.dynamic);

             struct ggml_tensor* out = diffusion_model.forward(ctx, noised_input, NULL, context, t_emb);
             ctx_size += ggml_used_mem(ctx) + ggml_used_mem_of_data(ctx);

             struct ggml_cgraph* diffusion_graph = ggml_build_forward_ctx(ctx, out);
-            cplan = ggml_graph_plan(diffusion_graph, n_threads);
+            cplan                               = ggml_graph_plan(diffusion_graph, n_threads);

             ctx_size += cplan.work_size;

             LOG_DEBUG("diffusion context need %.2fMB static memory, with work_size needing %.2fMB",
@@ -3795,10 +3795,10 @@ class StableDiffusionGGML {
         }

         struct ggml_init_params params;
-        params.mem_size = ctx_size;
+        params.mem_size   = ctx_size;
         params.mem_buffer = NULL;
-        params.no_alloc = false;
-        params.dynamic = dynamic;
+        params.no_alloc   = false;
+        params.dynamic    = dynamic;

         struct ggml_context* ctx = ggml_init(params);
         if (!ctx) {
@@ -3808,16 +3808,16 @@ class StableDiffusionGGML {

         ggml_set_dynamic(ctx, false);
         struct ggml_tensor* noised_input = ggml_dup_tensor(ctx, x_t);
-        struct ggml_tensor* context = ggml_dup_tensor(ctx, c);
-        struct ggml_tensor* timesteps = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 1);  // [N, ]
-        struct ggml_tensor* t_emb = new_timestep_embedding(ctx, timesteps, diffusion_model.model_channels);  // [N, model_channels]
+        struct ggml_tensor* context      = ggml_dup_tensor(ctx, c);
+        struct ggml_tensor* timesteps    = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 1);                               // [N, ]
+        struct ggml_tensor* t_emb        = new_timestep_embedding(ctx, timesteps, diffusion_model.model_channels);  // [N, model_channels]
         ggml_set_dynamic(ctx, params.dynamic);

         struct ggml_tensor* out = diffusion_model.forward(ctx, noised_input, NULL, context, t_emb);
         ggml_hold_dynamic_tensor(out);

         struct ggml_cgraph* diffusion_graph = ggml_new_graph(ctx);
-        diffusion_graph->order = GGML_CGRAPH_EVAL_ORDER_RIGHT_TO_LEFT;
+        diffusion_graph->order              = GGML_CGRAPH_EVAL_ORDER_RIGHT_TO_LEFT;
         ggml_build_forward_expand(diffusion_graph, out);
         cplan = ggml_graph_plan(diffusion_graph, n_threads);
@@ -3837,7 +3837,7 @@ class StableDiffusionGGML {

         // denoise wrapper
         ggml_set_dynamic(ctx, false);
-        struct ggml_tensor* out_cond = NULL;
+        struct ggml_tensor* out_cond   = NULL;
         struct ggml_tensor* out_uncond = NULL;
         if (cfg_scale != 1.0f && uc != NULL) {
             out_uncond = ggml_dup_tensor(ctx, x);
@@ -3848,17 +3848,17 @@ class StableDiffusionGGML {
         auto denoise = [&](ggml_tensor* input, float sigma, int step) {
             int64_t t0 = ggml_time_ms();

-            float c_skip = 1.0f;
-            float c_out = 1.0f;
-            float c_in = 1.0f;
+            float c_skip               = 1.0f;
+            float c_out                = 1.0f;
+            float c_in                 = 1.0f;
             std::vector<float> scaling = denoiser->get_scalings(sigma);
             if (scaling.size() == 3) {  // CompVisVDenoiser
                 c_skip = scaling[0];
-                c_out = scaling[1];
-                c_in = scaling[2];
+                c_out  = scaling[1];
+                c_in   = scaling[2];
             } else {  // CompVisDenoiser
                 c_out = scaling[0];
-                c_in = scaling[1];
+                c_in  = scaling[1];
             }

             float t = denoiser->schedule->sigma_to_t(sigma);
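The denoise lambda above scales its input by c_in, runs the UNet once (or twice when a negative prompt is active), and then blends the two predictions; the hunk that follows shows the blend. Isolated as a sketch:

    #include <cstdint>

    // out = uncond + cfg_scale * (cond - uncond); cfg_scale > 1 pushes the
    // prediction away from the unconditional output, toward the prompt.
    void apply_cfg(float* out, const float* uncond, const float* cond,
                   int64_t n, float cfg_scale) {
        for (int64_t i = 0; i < n; i++) {
            out[i] = uncond[i] + cfg_scale * (cond[i] - uncond[i]);
        }
    }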
@@ -3888,9 +3888,9 @@ class StableDiffusionGGML {

                 // out_uncond + cfg_scale * (out_cond - out_uncond)
                 {
-                    float* vec_out = (float*)out->data;
+                    float* vec_out        = (float*)out->data;
                     float* vec_out_uncond = (float*)out_uncond->data;
-                    float* vec_out_cond = (float*)out_cond->data;
+                    float* vec_out_cond   = (float*)out_cond->data;

                     for (int i = 0; i < ggml_nelements(out); i++) {
                         vec_out[i] = vec_out_uncond[i] + cfg_scale * (vec_out_cond[i] - vec_out_uncond[i]);
@@ -3906,8 +3906,8 @@ class StableDiffusionGGML {
             // denoised = (v * c_out + input * c_skip) or (input + eps * c_out)
             {
                 float* vec_denoised = (float*)denoised->data;
-                float* vec_input = (float*)input->data;
-                float* vec_out = (float*)out->data;
+                float* vec_input    = (float*)input->data;
+                float* vec_out      = (float*)out->data;

                 for (int i = 0; i < ggml_nelements(denoised); i++) {
                     vec_denoised[i] = vec_out[i] * c_out + vec_input[i] * c_skip;
@@ -3934,7 +3934,7 @@ class StableDiffusionGGML {
                 LOG_INFO("sampling using Euler A method");
                 ggml_set_dynamic(ctx, false);
                 struct ggml_tensor* noise = ggml_dup_tensor(ctx, x);
-                struct ggml_tensor* d = ggml_dup_tensor(ctx, x);
+                struct ggml_tensor* d     = ggml_dup_tensor(ctx, x);
                 ggml_set_dynamic(ctx, params.dynamic);

                 for (int i = 0; i < steps; i++) {
@@ -3945,8 +3945,8 @@ class StableDiffusionGGML {

                     // d = (x - denoised) / sigma
                     {
-                        float* vec_d = (float*)d->data;
-                        float* vec_x = (float*)x->data;
+                        float* vec_d        = (float*)d->data;
+                        float* vec_x        = (float*)x->data;
                         float* vec_denoised = (float*)denoised->data;

                         for (int i = 0; i < ggml_nelements(d); i++) {
@@ -3955,8 +3955,8 @@ class StableDiffusionGGML {
                     }

                     // get_ancestral_step
-                    float sigma_up = std::min(sigmas[i + 1],
-                                              std::sqrt(sigmas[i + 1] * sigmas[i + 1] * (sigmas[i] * sigmas[i] - sigmas[i + 1] * sigmas[i + 1]) / (sigmas[i] * sigmas[i])));
+                    float sigma_up   = std::min(sigmas[i + 1],
+                                                std::sqrt(sigmas[i + 1] * sigmas[i + 1] * (sigmas[i] * sigmas[i] - sigmas[i + 1] * sigmas[i + 1]) / (sigmas[i] * sigmas[i])));
                     float sigma_down = std::sqrt(sigmas[i + 1] * sigmas[i + 1] - sigma_up * sigma_up);

                     // Euler method
@@ -3976,7 +3976,7 @@ class StableDiffusionGGML {
                         ggml_tensor_set_f32_randn(noise, rng);
                         // noise = load_tensor_from_file(res_ctx, "./rand" + std::to_string(i+1) + ".bin");
                         {
-                            float* vec_x = (float*)x->data;
+                            float* vec_x     = (float*)x->data;
                             float* vec_noise = (float*)noise->data;

                             for (int i = 0; i < ggml_nelements(x); i++) {
@@ -4001,8 +4001,8 @@ class StableDiffusionGGML {

                     // d = (x - denoised) / sigma
                     {
-                        float* vec_d = (float*)d->data;
-                        float* vec_x = (float*)x->data;
+                        float* vec_d        = (float*)d->data;
+                        float* vec_x        = (float*)x->data;
                         float* vec_denoised = (float*)denoised->data;

                         for (int j = 0; j < ggml_nelements(d); j++) {
@@ -4025,7 +4025,7 @@ class StableDiffusionGGML {
             case HEUN: {
                 LOG_INFO("sampling using Heun method");
                 ggml_set_dynamic(ctx, false);
-                struct ggml_tensor* d = ggml_dup_tensor(ctx, x);
+                struct ggml_tensor* d  = ggml_dup_tensor(ctx, x);
                 struct ggml_tensor* x2 = ggml_dup_tensor(ctx, x);
                 ggml_set_dynamic(ctx, params.dynamic);

@@ -4035,8 +4035,8 @@ class StableDiffusionGGML {

                     // d = (x - denoised) / sigma
                     {
-                        float* vec_d = (float*)d->data;
-                        float* vec_x = (float*)x->data;
+                        float* vec_d        = (float*)d->data;
+                        float* vec_x        = (float*)x->data;
                         float* vec_denoised = (float*)denoised->data;

                         for (int j = 0; j < ggml_nelements(x); j++) {
@@ -4056,9 +4056,9 @@ class StableDiffusionGGML {
                         }
                     } else {
                         // Heun step
-                        float* vec_d = (float*)d->data;
+                        float* vec_d  = (float*)d->data;
                         float* vec_d2 = (float*)d->data;
-                        float* vec_x = (float*)x->data;
+                        float* vec_x  = (float*)x->data;
                         float* vec_x2 = (float*)x2->data;

                         for (int j = 0; j < ggml_nelements(x); j++) {
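Euler A above splits each move to sigma_{i+1} into a deterministic part (sigma_down) and fresh noise (sigma_up), following k-diffusion's get_ancestral_step. The same split as a standalone sketch:

    #include <algorithm>
    #include <cmath>
    #include <utility>

    // Returns {sigma_down, sigma_up}: step deterministically to sigma_down,
    // then add noise scaled by sigma_up to land at sigma_next overall.
    std::pair<float, float> ancestral_step(float sigma, float sigma_next) {
        float sigma_up   = std::min(sigma_next,
                                    std::sqrt(sigma_next * sigma_next * (sigma * sigma - sigma_next * sigma_next) / (sigma * sigma)));
        float sigma_down = std::sqrt(sigma_next * sigma_next - sigma_up * sigma_up);
        return {sigma_down, sigma_up};
    }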
@@ -4078,7 +4078,7 @@ class StableDiffusionGGML {
             case DPM2: {
                 LOG_INFO("sampling using DPM2 method");
                 ggml_set_dynamic(ctx, false);
-                struct ggml_tensor* d = ggml_dup_tensor(ctx, x);
+                struct ggml_tensor* d  = ggml_dup_tensor(ctx, x);
                 struct ggml_tensor* x2 = ggml_dup_tensor(ctx, x);
                 ggml_set_dynamic(ctx, params.dynamic);

@@ -4088,8 +4088,8 @@ class StableDiffusionGGML {

                     // d = (x - denoised) / sigma
                     {
-                        float* vec_d = (float*)d->data;
-                        float* vec_x = (float*)x->data;
+                        float* vec_d        = (float*)d->data;
+                        float* vec_x        = (float*)x->data;
                         float* vec_denoised = (float*)denoised->data;

                         for (int j = 0; j < ggml_nelements(x); j++) {
@@ -4100,7 +4100,7 @@ class StableDiffusionGGML {
                     if (sigmas[i + 1] == 0) {
                         // Euler step
                         // x = x + d * dt
-                        float dt = sigmas[i + 1] - sigmas[i];
+                        float dt     = sigmas[i + 1] - sigmas[i];
                         float* vec_d = (float*)d->data;
                         float* vec_x = (float*)x->data;

@@ -4110,11 +4110,11 @@ class StableDiffusionGGML {
                     } else {
                         // DPM-Solver-2
                         float sigma_mid = exp(0.5f * (log(sigmas[i]) + log(sigmas[i + 1])));
-                        float dt_1 = sigma_mid - sigmas[i];
-                        float dt_2 = sigmas[i + 1] - sigmas[i];
+                        float dt_1      = sigma_mid - sigmas[i];
+                        float dt_2      = sigmas[i + 1] - sigmas[i];

-                        float* vec_d = (float*)d->data;
-                        float* vec_x = (float*)x->data;
+                        float* vec_d  = (float*)d->data;
+                        float* vec_x  = (float*)x->data;
                         float* vec_x2 = (float*)x2->data;
                         for (int j = 0; j < ggml_nelements(x); j++) {
                             vec_x2[j] = vec_x[j] + vec_d[j] * dt_1;
@@ -4134,8 +4134,8 @@ class StableDiffusionGGML {
                 LOG_INFO("sampling using DPM++ (2s) a method");
                 ggml_set_dynamic(ctx, false);
                 struct ggml_tensor* noise = ggml_dup_tensor(ctx, x);
-                struct ggml_tensor* d = ggml_dup_tensor(ctx, x);
-                struct ggml_tensor* x2 = ggml_dup_tensor(ctx, x);
+                struct ggml_tensor* d     = ggml_dup_tensor(ctx, x);
+                struct ggml_tensor* x2    = ggml_dup_tensor(ctx, x);
                 ggml_set_dynamic(ctx, params.dynamic);

                 for (int i = 0; i < steps; i++) {
@@ -4143,16 +4143,16 @@ class StableDiffusionGGML {
                     denoise(x, sigmas[i], i + 1);

                     // get_ancestral_step
-                    float sigma_up = std::min(sigmas[i + 1],
-                                              std::sqrt(sigmas[i + 1] * sigmas[i + 1] * (sigmas[i] * sigmas[i] - sigmas[i + 1] * sigmas[i + 1]) / (sigmas[i] * sigmas[i])));
+                    float sigma_up   = std::min(sigmas[i + 1],
+                                                std::sqrt(sigmas[i + 1] * sigmas[i + 1] * (sigmas[i] * sigmas[i] - sigmas[i + 1] * sigmas[i + 1]) / (sigmas[i] * sigmas[i])));
                     float sigma_down = std::sqrt(sigmas[i + 1] * sigmas[i + 1] - sigma_up * sigma_up);

-                    auto t_fn = [](float sigma) -> float { return -log(sigma); };
-                    auto sigma_fn = [](float t) -> float { return exp(-t); };
+                    auto t_fn     = [](float sigma) -> float { return -log(sigma); };
+                    auto sigma_fn = [](float t) -> float { return exp(-t); };

                     if (sigma_down == 0) {
                         // Euler step
-                        float* vec_d = (float*)d->data;
-                        float* vec_x = (float*)x->data;
+                        float* vec_d        = (float*)d->data;
+                        float* vec_x        = (float*)x->data;
                         float* vec_denoised = (float*)denoised->data;

                         for (int j = 0; j < ggml_nelements(d); j++) {
@@ -4169,14 +4169,14 @@ class StableDiffusionGGML {
                         }
                     } else {
                         // DPM-Solver++(2S)
-                        float t = t_fn(sigmas[i]);
+                        float t      = t_fn(sigmas[i]);
                         float t_next = t_fn(sigma_down);
-                        float h = t_next - t;
-                        float s = t + 0.5f * h;
+                        float h      = t_next - t;
+                        float s      = t + 0.5f * h;

-                        float* vec_d = (float*)d->data;
-                        float* vec_x = (float*)x->data;
-                        float* vec_x2 = (float*)x2->data;
+                        float* vec_d        = (float*)d->data;
+                        float* vec_x        = (float*)x->data;
+                        float* vec_x2       = (float*)x2->data;
                         float* vec_denoised = (float*)denoised->data;

                         // First half-step
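The DPM2 branch above is a midpoint method in log-sigma space: denoise at sigma_i, step halfway to the geometric mean sigma_mid, denoise again there, and take the full step with the midpoint slope. The step sizes, isolated as a sketch:

    #include <cmath>

    void dpm2_steps(float sigma, float sigma_next,
                    float& sigma_mid, float& dt_1, float& dt_2) {
        sigma_mid = std::exp(0.5f * (std::log(sigma) + std::log(sigma_next)));  // geometric mean
        dt_1      = sigma_mid - sigma;   // half step, using the slope at sigma
        dt_2      = sigma_next - sigma;  // full step, using the slope at sigma_mid
    }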
@@ -4196,7 +4196,7 @@ class StableDiffusionGGML {
                     if (sigmas[i + 1] > 0) {
                         ggml_tensor_set_f32_randn(noise, rng);
                         {
-                            float* vec_x = (float*)x->data;
+                            float* vec_x     = (float*)x->data;
                             float* vec_noise = (float*)noise->data;

                             for (int i = 0; i < ggml_nelements(x); i++) {
@@ -4219,13 +4219,13 @@ class StableDiffusionGGML {
                     // denoise
                     denoise(x, sigmas[i], i + 1);

-                    float t = t_fn(sigmas[i]);
-                    float t_next = t_fn(sigmas[i + 1]);
-                    float h = t_next - t;
-                    float a = sigmas[i + 1] / sigmas[i];
-                    float b = exp(-h) - 1.f;
-                    float* vec_x = (float*)x->data;
-                    float* vec_denoised = (float*)denoised->data;
+                    float t                 = t_fn(sigmas[i]);
+                    float t_next            = t_fn(sigmas[i + 1]);
+                    float h                 = t_next - t;
+                    float a                 = sigmas[i + 1] / sigmas[i];
+                    float b                 = exp(-h) - 1.f;
+                    float* vec_x            = (float*)x->data;
+                    float* vec_denoised     = (float*)denoised->data;
                     float* vec_old_denoised = (float*)old_denoised->data;

                     if (i == 0 || sigmas[i + 1] == 0) {
@@ -4235,10 +4235,10 @@ class StableDiffusionGGML {
                         }
                     } else {
                         float h_last = t - t_fn(sigmas[i - 1]);
-                        float r = h_last / h;
+                        float r      = h_last / h;
                         for (int j = 0; j < ggml_nelements(x); j++) {
                             float denoised_d = (1.f + 1.f / (2.f * r)) * vec_denoised[j] - (1.f / (2.f * r)) * vec_old_denoised[j];
-                            vec_x[j] = a * vec_x[j] - b * denoised_d;
+                            vec_x[j]         = a * vec_x[j] - b * denoised_d;
                         }
                     }

@@ -4261,12 +4261,12 @@ class StableDiffusionGGML {
                     // denoise
                     denoise(x, sigmas[i], i + 1);

-                    float t = t_fn(sigmas[i]);
-                    float t_next = t_fn(sigmas[i + 1]);
-                    float h = t_next - t;
-                    float a = sigmas[i + 1] / sigmas[i];
-                    float* vec_x = (float*)x->data;
-                    float* vec_denoised = (float*)denoised->data;
+                    float t                 = t_fn(sigmas[i]);
+                    float t_next            = t_fn(sigmas[i + 1]);
+                    float h                 = t_next - t;
+                    float a                 = sigmas[i + 1] / sigmas[i];
+                    float* vec_x            = (float*)x->data;
+                    float* vec_denoised     = (float*)denoised->data;
                     float* vec_old_denoised = (float*)old_denoised->data;

                     if (i == 0 || sigmas[i + 1] == 0) {
@@ -4277,14 +4277,14 @@ class StableDiffusionGGML {
                         }
                     } else {
                         float h_last = t - t_fn(sigmas[i - 1]);
-                        float h_min = std::min(h_last, h);
-                        float h_max = std::max(h_last, h);
-                        float r = h_max / h_min;
-                        float h_d = (h_max + h_min) / 2.f;
-                        float b = exp(-h_d) - 1.f;
+                        float h_min  = std::min(h_last, h);
+                        float h_max  = std::max(h_last, h);
+                        float r      = h_max / h_min;
+                        float h_d    = (h_max + h_min) / 2.f;
+                        float b      = exp(-h_d) - 1.f;
                         for (int j = 0; j < ggml_nelements(x); j++) {
                             float denoised_d = (1.f + 1.f / (2.f * r)) * vec_denoised[j] - (1.f / (2.f * r)) * vec_old_denoised[j];
-                            vec_x[j] = a * vec_x[j] - b * denoised_d;
+                            vec_x[j]         = a * vec_x[j] - b * denoised_d;
                         }
                     }

@@ -4299,7 +4299,7 @@ class StableDiffusionGGML {
                 LOG_INFO("sampling using LCM method");
                 ggml_set_dynamic(ctx, false);
                 struct ggml_tensor* noise = ggml_dup_tensor(ctx, x);
-                struct ggml_tensor* d = ggml_dup_tensor(ctx, x);
+                struct ggml_tensor* d     = ggml_dup_tensor(ctx, x);
                 ggml_set_dynamic(ctx, params.dynamic);

                 for (int i = 0; i < steps; i++) {
@@ -4310,7 +4310,7 @@ class StableDiffusionGGML {

                     // x = denoised
                     {
-                        float* vec_x = (float*)x->data;
+                        float* vec_x        = (float*)x->data;
                         float* vec_denoised = (float*)denoised->data;
                         for (int j = 0; j < ggml_nelements(x); j++) {
                             vec_x[j] = vec_denoised[j];
@@ -4322,7 +4322,7 @@ class StableDiffusionGGML {
                         ggml_tensor_set_f32_randn(noise, rng);
                         // noise = load_tensor_from_file(res_ctx, "./rand" + std::to_string(i+1) + ".bin");
                         {
-                            float* vec_x = (float*)x->data;
+                            float* vec_x     = (float*)x->data;
                             float* vec_noise = (float*)noise->data;

                             for (int j = 0; j < ggml_nelements(x); j++) {
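Both DPM++ (2M) variants above share the exponential-integrator update x = a*x - b*denoised_d, where denoised_d blends the current and previous denoised estimates by the step-size ratio r. One multistep update, isolated as a sketch with t(sigma) = -log(sigma) as in the hunks (the i == 0 first-step case is excluded):

    #include <cmath>
    #include <cstdint>

    void dpmpp_2m_update(float* x, const float* denoised, const float* old_denoised,
                         int64_t n, float sigma_prev, float sigma, float sigma_next) {
        float t      = -std::log(sigma);
        float t_next = -std::log(sigma_next);
        float h      = t_next - t;
        float a      = sigma_next / sigma;
        float b      = std::exp(-h) - 1.0f;
        float r      = (t - (-std::log(sigma_prev))) / h;  // h_last / h
        for (int64_t j = 0; j < n; j++) {
            float denoised_d = (1.0f + 1.0f / (2.0f * r)) * denoised[j] - (1.0f / (2.0f * r)) * old_denoised[j];
            x[j]             = a * x[j] - b * denoised_d;
        }
    }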
@@ -4365,8 +4365,8 @@ class StableDiffusionGGML {
     }

     ggml_tensor* encode_first_stage(ggml_context* res_ctx, ggml_tensor* x) {
-        int64_t W = x->ne[0];
-        int64_t H = x->ne[1];
+        int64_t W                  = x->ne[0];
+        int64_t H                  = x->ne[1];
         struct ggml_tensor* result = NULL;
         struct ggml_cplan cplan;
@@ -4374,10 +4374,10 @@ class StableDiffusionGGML {
         size_t ctx_size = 10 * 1024 * 1024;  // 10MB
         {
             struct ggml_init_params params;
-            params.mem_size = ctx_size;
+            params.mem_size   = ctx_size;
             params.mem_buffer = NULL;
-            params.no_alloc = true;
-            params.dynamic = dynamic;
+            params.no_alloc   = true;
+            params.dynamic    = dynamic;

             struct ggml_context* ctx = ggml_init(params);
             if (!ctx) {
@@ -4389,7 +4389,7 @@ class StableDiffusionGGML {
             ctx_size += ggml_used_mem(ctx) + ggml_used_mem_of_data(ctx);

             struct ggml_cgraph* vae_graph = ggml_build_forward_ctx(ctx, moments);
-            cplan = ggml_graph_plan(vae_graph, n_threads);
+            cplan                         = ggml_graph_plan(vae_graph, n_threads);

             ctx_size += cplan.work_size;

             LOG_DEBUG("vae context need %.2fMB static memory, with work_size needing %.2fMB",
@@ -4401,10 +4401,10 @@ class StableDiffusionGGML {

         {
             struct ggml_init_params params;
-            params.mem_size = ctx_size;
+            params.mem_size   = ctx_size;
             params.mem_buffer = NULL;
-            params.no_alloc = false;
-            params.dynamic = dynamic;
+            params.no_alloc   = false;
+            params.dynamic    = dynamic;

             struct ggml_context* ctx = ggml_init(params);
             if (!ctx) {
@@ -4415,7 +4415,7 @@ class StableDiffusionGGML {
             struct ggml_tensor* moments = first_stage_model.encode(ctx, x);

             struct ggml_cgraph* vae_graph = ggml_new_graph(ctx);
-            vae_graph->order = GGML_CGRAPH_EVAL_ORDER_RIGHT_TO_LEFT;
+            vae_graph->order              = GGML_CGRAPH_EVAL_ORDER_RIGHT_TO_LEFT;
             ggml_build_forward_expand(vae_graph, moments);

             int64_t t0 = ggml_time_ms();
@@ -4460,26 +4460,26 @@ class StableDiffusionGGML {
     // ldm.models.diffusion.ddpm.LatentDiffusion.get_first_stage_encoding
     ggml_tensor* get_first_stage_encoding(ggml_context* res_ctx, ggml_tensor* moments) {
         // ldm.modules.distributions.distributions.DiagonalGaussianDistribution.sample
-        ggml_tensor* latent = ggml_new_tensor_4d(res_ctx, moments->type, moments->ne[0],
-                                                 moments->ne[1], moments->ne[2] / 2, moments->ne[3]);
+        ggml_tensor* latent       = ggml_new_tensor_4d(res_ctx, moments->type, moments->ne[0],
+                                                       moments->ne[1], moments->ne[2] / 2, moments->ne[3]);
         struct ggml_tensor* noise = ggml_dup_tensor(res_ctx, latent);
         ggml_tensor_set_f32_randn(noise, rng);
         // noise = load_tensor_from_file(res_ctx, "noise.bin");
         {
-            float mean = 0;
+            float mean   = 0;
             float logvar = 0;
-            float value = 0;
-            float std_ = 0;
+            float value  = 0;
+            float std_   = 0;
             for (int i = 0; i < latent->ne[3]; i++) {
                 for (int j = 0; j < latent->ne[2]; j++) {
                     for (int k = 0; k < latent->ne[1]; k++) {
                         for (int l = 0; l < latent->ne[0]; l++) {
-                            mean = ggml_tensor_get_f32(moments, l, k, j, i);
+                            mean   = ggml_tensor_get_f32(moments, l, k, j, i);
                             logvar = ggml_tensor_get_f32(moments, l, k, j + (int)latent->ne[2], i);
                             logvar = std::max(-30.0f, std::min(logvar, 20.0f));
-                            std_ = std::exp(0.5f * logvar);
-                            value = mean + std_ * ggml_tensor_get_f32(noise, l, k, j, i);
-                            value = value * scale_factor;
+                            std_   = std::exp(0.5f * logvar);
+                            value  = mean + std_ * ggml_tensor_get_f32(noise, l, k, j, i);
+                            value  = value * scale_factor;
                             // printf("%d %d %d %d -> %f\n", i, j, k, l, value);
                             ggml_tensor_set_f32(latent, value, l, k, j, i);
                         }
@@ -4491,8 +4491,8 @@ class StableDiffusionGGML {
     }

     ggml_tensor* decode_first_stage(ggml_context* res_ctx, ggml_tensor* z) {
-        int64_t W = z->ne[0];
-        int64_t H = z->ne[1];
+        int64_t W                      = z->ne[0];
+        int64_t H                      = z->ne[1];
         struct ggml_tensor* result_img = NULL;
         struct ggml_cplan cplan;
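get_first_stage_encoding() above implements DiagonalGaussianDistribution.sample: the encoder's "moments" tensor stacks mean and log-variance along the channel axis, and each latent value is drawn with the reparameterization trick, then multiplied by scale_factor (0.18215). Per element, as a sketch:

    #include <algorithm>
    #include <cmath>

    float sample_latent(float mean, float logvar, float noise, float scale_factor) {
        logvar     = std::max(-30.0f, std::min(logvar, 20.0f));  // clamp as above
        float std_ = std::exp(0.5f * logvar);
        return (mean + std_ * noise) * scale_factor;
    }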
@@ -4507,10 +4507,10 @@ class StableDiffusionGGML {
         size_t ctx_size = 10 * 1024 * 1024;  // 10MB
         {
             struct ggml_init_params params;
-            params.mem_size = ctx_size;
+            params.mem_size   = ctx_size;
             params.mem_buffer = NULL;
-            params.no_alloc = true;
-            params.dynamic = dynamic;
+            params.no_alloc   = true;
+            params.dynamic    = dynamic;

             struct ggml_context* ctx = ggml_init(params);
             if (!ctx) {
@@ -4522,7 +4522,7 @@ class StableDiffusionGGML {
             ctx_size += ggml_used_mem(ctx) + ggml_used_mem_of_data(ctx);

             struct ggml_cgraph* vae_graph = ggml_build_forward_ctx(ctx, img);
-            cplan = ggml_graph_plan(vae_graph, n_threads);
+            cplan                         = ggml_graph_plan(vae_graph, n_threads);

             ctx_size += cplan.work_size;

             LOG_DEBUG("vae context need %.2fMB static memory, with work_size needing %.2fMB",
@@ -4534,10 +4534,10 @@ class StableDiffusionGGML {

         {
             struct ggml_init_params params;
-            params.mem_size = ctx_size;
+            params.mem_size   = ctx_size;
             params.mem_buffer = NULL;
-            params.no_alloc = false;
-            params.dynamic = dynamic;
+            params.no_alloc   = false;
+            params.dynamic    = dynamic;

             struct ggml_context* ctx = ggml_init(params);
             if (!ctx) {
@@ -4548,7 +4548,7 @@ class StableDiffusionGGML {
             struct ggml_tensor* img = first_stage_model.decode(ctx, z);

             struct ggml_cgraph* vae_graph = ggml_new_graph(ctx);
-            vae_graph->order = GGML_CGRAPH_EVAL_ORDER_RIGHT_TO_LEFT;
+            vae_graph->order              = GGML_CGRAPH_EVAL_ORDER_RIGHT_TO_LEFT;
             ggml_build_forward_expand(vae_graph, img);

             int64_t t0 = ggml_time_ms();
@@ -4621,9 +4621,9 @@ std::vector<uint8_t> StableDiffusion::txt2img(std::string prompt,
     struct ggml_init_params params;
     params.mem_size = static_cast<size_t>(10 * 1024) * 1024;  // 10M
     params.mem_size += width * height * 3 * sizeof(float) * 2;
-    params.mem_buffer = NULL;
-    params.no_alloc = false;
-    params.dynamic = false;
+    params.mem_buffer = NULL;
+    params.no_alloc   = false;
+    params.dynamic    = false;
     struct ggml_context* ctx = ggml_init(params);
     if (!ctx) {
         LOG_ERROR("ggml_init() failed");
@@ -4636,7 +4636,7 @@ std::vector<uint8_t> StableDiffusion::txt2img(std::string prompt,
     sd->rng->manual_seed(seed);

     // extract and remote lora
-    auto result_pair = extract_and_remove_lora(prompt);
+    auto result_pair                                = extract_and_remove_lora(prompt);
     std::unordered_map<std::string, float> lora_f2m = result_pair.first;  // lora_name -> multiplier
     for (auto& kv : lora_f2m) {
         LOG_DEBUG("lora %s:%.2f", kv.first.c_str(), kv.second);
@@ -4650,8 +4650,8 @@ std::vector<uint8_t> StableDiffusion::txt2img(std::string prompt,
     int64_t t1 = ggml_time_ms();
     LOG_INFO("apply_loras completed, taking %.2fs", (t1 - t0) * 1.0f / 1000);

-    t0 = ggml_time_ms();
-    ggml_tensor* c = sd->get_learned_condition(ctx, prompt);
+    t0                     = ggml_time_ms();
+    ggml_tensor* c         = sd->get_learned_condition(ctx, prompt);
     struct ggml_tensor* uc = NULL;
     if (cfg_scale != 1.0) {
         uc = sd->get_learned_condition(ctx, negative_prompt);
@@ -4665,9 +4665,9 @@ std::vector<uint8_t> StableDiffusion::txt2img(std::string prompt,
         sd->clip_params_ctx = NULL;
     }

-    int C = 4;
-    int W = width / 8;
-    int H = height / 8;
+    int C = 4;
+    int W = width / 8;
+    int H = height / 8;
     struct ggml_tensor* x_t = ggml_new_tensor_4d(ctx, GGML_TYPE_F32, W, H, C, 1);
     ggml_tensor_set_f32_randn(x_t, sd->rng);
@@ -4728,7 +4728,7 @@ std::vector<uint8_t> StableDiffusion::img2img(const std::vector<uint8_t>& init_i
     LOG_INFO("img2img %dx%d", width, height);

     std::vector<float> sigmas = sd->denoiser->schedule->get_sigmas(sample_steps);
-    size_t t_enc = static_cast<size_t>(sample_steps * strength);
+    size_t t_enc              = static_cast<size_t>(sample_steps * strength);
     LOG_INFO("target t_enc is %zu steps", t_enc);
     std::vector<float> sigma_sched;
     sigma_sched.assign(sigmas.begin() + sample_steps - t_enc - 1, sigmas.end());
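The img2img hunk above shortens the schedule by the denoising strength: with sample_steps steps and strength in [0, 1), only the tail of the sigma table is kept, so a lower strength preserves more of the init image. The slice, isolated as a sketch:

    #include <cstddef>
    #include <vector>

    // Keep the last t_enc + 2 sigma values; e.g. 20 steps at strength 0.75
    // gives t_enc = 15, keeping 17 of the 21 sigmas. Assumes t_enc < sample_steps.
    std::vector<float> truncated_schedule(const std::vector<float>& sigmas,
                                          size_t sample_steps, float strength) {
        size_t t_enc = static_cast<size_t>(sample_steps * strength);
        return std::vector<float>(sigmas.begin() + sample_steps - t_enc - 1, sigmas.end());
    }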
@@ -4736,9 +4736,9 @@ std::vector<uint8_t> StableDiffusion::img2img(const std::vector<uint8_t>& init_i
     struct ggml_init_params params;
     params.mem_size = static_cast<size_t>(10 * 1024) * 1024;  // 10M
     params.mem_size += width * height * 3 * sizeof(float) * 2;
-    params.mem_buffer = NULL;
-    params.no_alloc = false;
-    params.dynamic = false;
+    params.mem_buffer = NULL;
+    params.no_alloc   = false;
+    params.dynamic    = false;
     struct ggml_context* ctx = ggml_init(params);
     if (!ctx) {
         LOG_ERROR("ggml_init() failed");
@@ -4751,7 +4751,7 @@ std::vector<uint8_t> StableDiffusion::img2img(const std::vector<uint8_t>& init_i
     sd->rng->manual_seed(seed);

     // extract and remote lora
-    auto result_pair = extract_and_remove_lora(prompt);
+    auto result_pair                                = extract_and_remove_lora(prompt);
     std::unordered_map<std::string, float> lora_f2m = result_pair.first;  // lora_name -> multiplier
     for (auto& kv : lora_f2m) {
         LOG_DEBUG("lora %s:%.2f", kv.first.c_str(), kv.second);
@@ -4768,8 +4768,8 @@ std::vector<uint8_t> StableDiffusion::img2img(const std::vector<uint8_t>& init_i
     ggml_tensor* init_img = ggml_new_tensor_4d(ctx, GGML_TYPE_F32, width, height, 3, 1);
     image_vec_to_ggml(init_img_vec, init_img);

-    t0 = ggml_time_ms();
-    ggml_tensor* moments = sd->encode_first_stage(ctx, init_img);
+    t0                       = ggml_time_ms();
+    ggml_tensor* moments     = sd->encode_first_stage(ctx, init_img);
     ggml_tensor* init_latent = sd->get_first_stage_encoding(ctx, moments);
     // print_ggml_tensor(init_latent);
     t1 = ggml_time_ms();
@@ -4777,7 +4777,7 @@ std::vector<uint8_t> StableDiffusion::img2img(const std::vector<uint8_t>& init_i

     ggml_reset_curr_max_dynamic_size();  // reset counter

-    ggml_tensor* c = sd->get_learned_condition(ctx, prompt);
+    ggml_tensor* c         = sd->get_learned_condition(ctx, prompt);
     struct ggml_tensor* uc = NULL;
     if (cfg_scale != 1.0) {
         uc = sd->get_learned_condition(ctx, negative_prompt);
diff --git a/stable-diffusion.h b/stable-diffusion.h
index ed8cd1f..84d261e 100644
--- a/stable-diffusion.h
+++ b/stable-diffusion.h
@@ -38,15 +38,15 @@ enum Schedule {
 class StableDiffusionGGML;

 class StableDiffusion {
-   private:
+private:
     std::shared_ptr<StableDiffusionGGML> sd;

-   public:
-    StableDiffusion(int n_threads = -1,
-                    bool vae_decode_only = false,
+public:
+    StableDiffusion(int n_threads                = -1,
+                    bool vae_decode_only         = false,
                     bool free_params_immediately = false,
-                    std::string lora_model_dir = "",
-                    RNGType rng_type = STD_DEFAULT_RNG);
+                    std::string lora_model_dir   = "",
+                    RNGType rng_type             = STD_DEFAULT_RNG);
     bool load_from_file(const std::string& file_path, Schedule d = DEFAULT);
     std::vector<uint8_t> txt2img(
         std::string prompt,