diff --git a/.clang-format b/.clang-format
new file mode 100644
index 0000000..58d1885
--- /dev/null
+++ b/.clang-format
@@ -0,0 +1,12 @@
+BasedOnStyle: Chromium
+UseTab: Never
+IndentWidth: 4
+TabWidth: 4
+AllowShortIfStatementsOnASingleLine: false
+ColumnLimit: 0
+AccessModifierOffset: -4
+NamespaceIndentation: All
+FixNamespaceComments: false
+AlignAfterOpenBracket: true
+AlignConsecutiveAssignments: true
+IndentCaseLabels: true
\ No newline at end of file
diff --git a/.gitignore b/.gitignore
index 59a8a2c..fd8ae08 100644
--- a/.gitignore
+++ b/.gitignore
@@ -3,3 +3,4 @@
 test/
 .cache/
 *.swp
+.vscode/
diff --git a/examples/main.cpp b/examples/main.cpp
index b97035a..f4d6ca8 100644
--- a/examples/main.cpp
+++ b/examples/main.cpp
@@ -88,11 +88,10 @@ const char* sample_method_str[] = {
 const char* schedule_str[] = {
     "default",
     "discrete",
-    "karras"
-};
+    "karras"};
 
 struct Option {
-    int n_threads = -1;
+    int n_threads    = -1;
     std::string mode = TXT2IMG;
     std::string model_path;
     std::string lora_model_dir;
@@ -100,16 +99,16 @@ struct Option {
     std::string init_img;
     std::string prompt;
     std::string negative_prompt;
-    float cfg_scale = 7.0f;
-    int w = 512;
-    int h = 512;
+    float cfg_scale            = 7.0f;
+    int w                      = 512;
+    int h                      = 512;
     SampleMethod sample_method = EULER_A;
-    Schedule schedule = DEFAULT;
-    int sample_steps = 20;
-    float strength = 0.75f;
-    RNGType rng_type = CUDA_RNG;
-    int64_t seed = 42;
-    bool verbose = false;
+    Schedule schedule          = DEFAULT;
+    int sample_steps           = 20;
+    float strength             = 0.75f;
+    RNGType rng_type           = CUDA_RNG;
+    int64_t seed               = 42;
+    bool verbose               = false;
 
     void print() {
         printf("Option: \n");
@@ -129,7 +128,7 @@ struct Option {
         printf("    sample_steps: %d\n", sample_steps);
         printf("    strength: %.2f\n", strength);
         printf("    rng: %s\n", rng_type_to_str[rng_type]);
-        printf("    seed: %lld\n", seed);
+        printf("    seed: %ld\n", seed);
     }
 };
 
@@ -266,7 +265,7 @@ void parse_args(int argc, const char* argv[], Option* opt) {
                 break;
             }
             const char* schedule_selected = argv[i];
-            int schedule_found = -1;
+            int schedule_found            = -1;
             for (int d = 0; d < N_SCHEDULES; d++) {
                 if (!strcmp(schedule_selected, schedule_str[d])) {
                     schedule_found = d;
@@ -289,7 +288,7 @@ void parse_args(int argc, const char* argv[], Option* opt) {
                 break;
             }
             const char* sample_method_selected = argv[i];
-            int sample_method_found = -1;
+            int sample_method_found            = -1;
             for (int m = 0; m < N_SAMPLE_METHODS; m++) {
                 if (!strcmp(sample_method_selected, sample_method_str[m])) {
                     sample_method_found = m;
@@ -405,6 +404,7 @@ int main(int argc, const char* argv[]) {
         vae_decode_only = false;
 
         int c = 0;
+
         unsigned char* img_data = stbi_load(opt.init_img.c_str(), &opt.w, &opt.h, &c, 3);
         if (img_data == NULL) {
             fprintf(stderr, "load image from '%s' failed\n", opt.init_img.c_str());
diff --git a/rng.h b/rng.h
index 73d998b..3340be6 100644
--- a/rng.h
+++ b/rng.h
@@ -5,23 +5,23 @@
 #include <vector>
 
 class RNG {
-   public:
-    virtual void manual_seed(uint64_t seed) = 0;
+public:
+    virtual void manual_seed(uint64_t seed)      = 0;
     virtual std::vector<float> randn(uint32_t n) = 0;
 };
 
 class STDDefaultRNG : public RNG {
-   private:
+private:
     std::default_random_engine generator;
 
-   public:
+public:
     void manual_seed(uint64_t seed) {
         generator.seed((unsigned int)seed);
     }
 
     std::vector<float> randn(uint32_t n) {
         std::vector<float> result;
-        float mean = 0.0;
+        float mean   = 0.0;
         float stddev = 1.0;
         std::normal_distribution<float> distribution(mean, stddev);
         for (uint32_t i = 0; i < n; i++) {
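The `rng.h` hunk above only reindents the `RNG` interface that both generators implement. For orientation, a minimal usage sketch, assuming `rng.h` and `rng_philox.h` from this repository are on the include path (this snippet is not part of the patch):

```cpp
// Sketch only: exercises the RNG interface shown above.
#include <cstdio>
#include <memory>
#include <vector>

#include "rng.h"
#include "rng_philox.h"

int main() {
    // Either implementation can sit behind the shared pointer;
    // PhiloxRNG imitates torch's CUDA randn sequence on the CPU.
    std::shared_ptr<RNG> rng = std::make_shared<PhiloxRNG>();
    rng->manual_seed(42);
    std::vector<float> noise = rng->randn(4);  // 4 samples from N(0, 1)
    for (float v : noise) {
        printf("%f\n", v);
    }
}
```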
diff --git a/rng_philox.h b/rng_philox.h
index e334768..5672b5a 100644
--- a/rng_philox.h
+++ b/rng_philox.h
@@ -9,15 +9,15 @@
 // RNG imitating torch cuda randn on CPU.
 // Port from: https://github.com/AUTOMATIC1111/stable-diffusion-webui/blob/5ef669de080814067961f28357256e8fe27544f4/modules/rng_philox.py
 class PhiloxRNG : public RNG {
-   private:
+private:
     uint64_t seed;
     uint32_t offset;
 
-   private:
+private:
     std::vector<uint32_t> philox_m = {0xD2511F53, 0xCD9E8D57};
     std::vector<uint32_t> philox_w = {0x9E3779B9, 0xBB67AE85};
-    float two_pow32_inv = 2.3283064e-10f;
-    float two_pow32_inv_2pi = 2.3283064e-10f * 6.2831855f;
+    float two_pow32_inv            = 2.3283064e-10f;
+    float two_pow32_inv_2pi        = 2.3283064e-10f * 6.2831855f;
 
     std::vector<uint32_t> uint32(uint64_t x) {
         std::vector<uint32_t> result(2);
@@ -87,14 +87,14 @@ class PhiloxRNG : public RNG {
         return r1;
     }
 
-   public:
+public:
     PhiloxRNG(uint64_t seed = 0) {
-        this->seed = seed;
+        this->seed   = seed;
         this->offset = 0;
     }
 
     void manual_seed(uint64_t seed) {
-        this->seed = seed;
+        this->seed   = seed;
         this->offset = 0;
     }
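`two_pow32_inv` is 2^-32 and `two_pow32_inv_2pi` is 2^-32 * 2*pi: exactly the constants needed to map pairs of 32-bit Philox outputs onto standard normals via the Box-Muller transform, which is what the referenced webui `rng_philox.py` does. A hedged sketch of that transform alone (the full function body lies outside this diff):

```cpp
// Box-Muller sketch: maps two uniform uint32 values to one standard normal.
// The scaling constants mirror two_pow32_inv and two_pow32_inv_2pi above.
#include <cmath>
#include <cstdint>

float box_muller(uint32_t a, uint32_t b) {
    const float two_pow32_inv     = 2.3283064e-10f;               // 2^-32
    const float two_pow32_inv_2pi = 2.3283064e-10f * 6.2831855f;  // 2^-32 * 2*pi
    float u = a * two_pow32_inv + two_pow32_inv / 2;              // uniform in (0, 1)
    float v = b * two_pow32_inv_2pi + two_pow32_inv_2pi / 2;      // uniform in (0, 2*pi)
    return std::sqrt(-2.0f * std::log(u)) * std::cos(v);
}
```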
diff --git a/stable-diffusion.cpp b/stable-diffusion.cpp
index 8339501..1a0b05d 100644
--- a/stable-diffusion.cpp
+++ b/stable-diffusion.cpp
@@ -108,7 +108,7 @@ ggml_tensor* load_tensor_from_file(ggml_context* ctx, const std::string& file_pa
     }
     int32_t nelements = 1;
-    int32_t ne[4] = {1, 1, 1, 1};
+    int32_t ne[4]     = {1, 1, 1, 1};
     for (int i = 0; i < n_dims; ++i) {
         file.read(reinterpret_cast<char*>(&ne[i]), sizeof(ne[i]));
         nelements *= ne[i];
@@ -116,13 +116,13 @@ ggml_tensor* load_tensor_from_file(ggml_context* ctx, const std::string& file_pa
     std::string name(length, 0);
     file.read(&name[0], length);
     ggml_tensor* tensor = ggml_new_tensor_4d(ctx, (ggml_type)ttype, ne[0], ne[1], ne[2], ne[3]);
-    const size_t bpe = ggml_type_size(ggml_type(ttype));
+    const size_t bpe    = ggml_type_size(ggml_type(ttype));
     file.read(reinterpret_cast<char*>(tensor->data), ggml_nbytes(tensor));
     return tensor;
 }
 
 void ggml_tensor_set_f32_randn(struct ggml_tensor* tensor, std::shared_ptr<RNG> rng) {
-    uint32_t n = (uint32_t)ggml_nelements(tensor);
+    uint32_t n                        = (uint32_t)ggml_nelements(tensor);
     std::vector<float> random_numbers = rng->randn(n);
     for (uint32_t i = 0; i < n; i++) {
         ggml_set_f32_1d(tensor, i, random_numbers[i]);
@@ -231,7 +231,7 @@ std::vector<uint8_t> ggml_to_image_vec(struct ggml_tensor* t) {
         for (int j = 0; j < w; j++) {
             for (int k = 0; k < c; k++) {
                 float value = ggml_tensor_get_f32(t, j, i, k);
-                value = (value + 1.0f) * 0.5f;
+                value       = (value + 1.0f) * 0.5f;
                 if (value < 0) {
                     value = 0;
                 } else if (value > 1) {
@@ -247,16 +247,16 @@ std::vector<uint8_t> ggml_to_image_vec(struct ggml_tensor* t) {
 
 void image_vec_to_ggml(const std::vector<uint8_t>& vec,
                        struct ggml_tensor* t) {
-    int64_t w = t->ne[0];
-    int64_t h = t->ne[1];
-    int64_t c = t->ne[2];
+    int64_t w     = t->ne[0];
+    int64_t h     = t->ne[1];
+    int64_t c     = t->ne[2];
     uint8_t* data = (uint8_t*)vec.data();
     for (int i = 0; i < h; i++) {
         for (int j = 0; j < w; j++) {
             for (int k = 0; k < c; k++) {
                 float value = *(data + i * w * c + j * c + k);
-                value = value / 255.f;
-                value = 2 * value - 1;
+                value       = value / 255.f;
+                value       = 2 * value - 1;
                 ggml_tensor_set_f32(t, value, j, i, k);
             }
         }
@@ -275,7 +275,7 @@ std::pair<std::unordered_map<std::string, float>, std::string> extract_and_remov
     while (std::regex_search(text, matches, re)) {
         std::string filename = matches[1].str();
-        float multiplier = std::stof(matches[2].str());
+        float multiplier     = std::stof(matches[2].str());
         if (multiplier < 0.f) {
             continue;
         }
@@ -322,14 +322,14 @@ const int PAD_TOKEN_ID = 49407;
 // Ref: https://github.com/openai/CLIP/blob/main/clip/simple_tokenizer.py
 // TODO: implement bpe
 class CLIPTokenizer {
-   private:
+private:
     ModelType model_type = SD1;
     std::map<std::u32string, int> encoder;
     std::regex pat;
 
     static std::string strip(const std::string& str) {
         std::string::size_type start = str.find_first_not_of(" \t\n\r\v\f");
-        std::string::size_type end = str.find_last_not_of(" \t\n\r\v\f");
+        std::string::size_type end   = str.find_last_not_of(" \t\n\r\v\f");
 
         if (start == std::string::npos) {
             // String contains only whitespace characters
@@ -345,7 +345,7 @@ class CLIPTokenizer {
         return text;
     }
 
-   public:
+public:
     CLIPTokenizer(ModelType model_type = SD1)
         : model_type(model_type){};
 
     std::string bpe(std::string token) {
@@ -457,7 +457,7 @@ std::vector<std::pair<std::string, float>> parse_prompt_attention(const std::str
     std::vector<int> round_brackets;
     std::vector<int> square_brackets;
 
-    float round_bracket_multiplier = 1.1f;
+    float round_bracket_multiplier  = 1.1f;
     float square_bracket_multiplier = 1 / 1.1f;
 
     std::regex re_attention(R"(\\\(|\\\)|\\\[|\\\]|\\\\|\\|\(|\[|:([+-]?[.\d]+)\)|\)|\]|[^\\()\[\]:]+|:)");
@@ -473,7 +473,7 @@ std::vector<std::pair<std::string, float>> parse_prompt_attention(const std::str
     std::string remaining_text = text;
 
     while (std::regex_search(remaining_text, m, re_attention)) {
-        std::string text = m[0];
+        std::string text   = m[0];
         std::string weight = m[1];
 
         if (text == "(") {
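`parse_prompt_attention` implements the webui emphasis syntax: `(text)` multiplies the attention weight by 1.1, `[text]` divides by 1.1, and `(text:1.5)` sets an explicit weight. A hypothetical call, assuming the file-local function were exposed (illustrative only; the exact segmentation depends on the regex above):

```cpp
// Illustrative: expected (text, weight) pairs for one prompt, based on
// the 1.1 and 1 / 1.1 multipliers defined above.
auto parsed = parse_prompt_attention("a (red:1.5) car in [fog]");
// parsed ~ { {"a ", 1.0f}, {"red", 1.5f}, {" car in ", 1.0f}, {"fog", 1.0f / 1.1f} }
for (const auto& item : parsed) {
    printf("'%s' -> %.3f\n", item.first.c_str(), item.second);
}
```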
@@ -595,32 +595,32 @@ struct ResidualAttentionBlock {
     }
 
     void map_by_name(std::map<std::string, struct ggml_tensor*>& tensors, const std::string prefix) {
-        tensors[prefix + "self_attn.q_proj.weight"] = q_w;
-        tensors[prefix + "self_attn.q_proj.bias"] = q_b;
-        tensors[prefix + "self_attn.k_proj.weight"] = k_w;
-        tensors[prefix + "self_attn.k_proj.bias"] = k_b;
-        tensors[prefix + "self_attn.v_proj.weight"] = v_w;
-        tensors[prefix + "self_attn.v_proj.bias"] = v_b;
+        tensors[prefix + "self_attn.q_proj.weight"]   = q_w;
+        tensors[prefix + "self_attn.q_proj.bias"]     = q_b;
+        tensors[prefix + "self_attn.k_proj.weight"]   = k_w;
+        tensors[prefix + "self_attn.k_proj.bias"]     = k_b;
+        tensors[prefix + "self_attn.v_proj.weight"]   = v_w;
+        tensors[prefix + "self_attn.v_proj.bias"]     = v_b;
         tensors[prefix + "self_attn.out_proj.weight"] = out_w;
-        tensors[prefix + "self_attn.out_proj.bias"] = out_b;
+        tensors[prefix + "self_attn.out_proj.bias"]   = out_b;
 
         tensors[prefix + "layer_norm1.weight"] = ln1_w;
-        tensors[prefix + "layer_norm1.bias"] = ln1_b;
+        tensors[prefix + "layer_norm1.bias"]   = ln1_b;
         tensors[prefix + "layer_norm2.weight"] = ln2_w;
-        tensors[prefix + "layer_norm2.bias"] = ln2_b;
+        tensors[prefix + "layer_norm2.bias"]   = ln2_b;
 
         tensors[prefix + "mlp.fc1.weight"] = fc1_w;
-        tensors[prefix + "mlp.fc1.bias"] = fc1_b;
+        tensors[prefix + "mlp.fc1.bias"]   = fc1_b;
         tensors[prefix + "mlp.fc2.weight"] = fc2_w;
-        tensors[prefix + "mlp.fc2.bias"] = fc2_b;
+        tensors[prefix + "mlp.fc2.bias"]   = fc2_b;
     }
 
     struct ggml_tensor* forward(struct ggml_context* ctx, struct ggml_tensor* x) {
         // x: [N, n_token, hidden_size]
-        int64_t N = x->ne[2];
-        int64_t n_token = x->ne[1];
+        int64_t N           = x->ne[2];
+        int64_t n_token     = x->ne[1];
         int64_t hidden_size = n_head * d_model;
 
         struct ggml_tensor* r = x;
@@ -637,24 +637,24 @@ struct ResidualAttentionBlock {
             struct ggml_tensor* q = ggml_add(ctx, ggml_repeat(ctx, q_b, x), ggml_mul_mat(ctx, q_w, x));
-            q = ggml_scale_inplace(ctx, q, ggml_new_f32(ctx, 1.0f / sqrt((float)d_model)));
-            q = ggml_reshape_4d(ctx, q, d_model, n_head, n_token, N);   // [N, n_token, n_head, d_model]
-            q = ggml_cont(ctx, ggml_permute(ctx, q, 0, 2, 1, 3));       // [N, n_head, n_token, d_model]
-            q = ggml_reshape_3d(ctx, q, d_model, n_token, n_head * N);  // [N * n_head, n_token, d_model]
+            q                     = ggml_scale_inplace(ctx, q, ggml_new_f32(ctx, 1.0f / sqrt((float)d_model)));
+            q                     = ggml_reshape_4d(ctx, q, d_model, n_head, n_token, N);   // [N, n_token, n_head, d_model]
+            q                     = ggml_cont(ctx, ggml_permute(ctx, q, 0, 2, 1, 3));       // [N, n_head, n_token, d_model]
+            q                     = ggml_reshape_3d(ctx, q, d_model, n_token, n_head * N);  // [N * n_head, n_token, d_model]
 
             struct ggml_tensor* k = ggml_add(ctx, ggml_repeat(ctx, k_b, x), ggml_mul_mat(ctx, k_w, x));
-            k = ggml_reshape_4d(ctx, k, d_model, n_head, n_token, N);  // [N, n_token, n_head, d_model]
-            k = ggml_cont(ctx, ggml_permute(ctx, k, 0, 2, 1, 3));      // [N, n_head, n_token, d_model]
-            k = ggml_reshape_3d(ctx, k, d_model, n_token, n_head);     // [N * n_head, n_token, d_model]
+            k                     = ggml_reshape_4d(ctx, k, d_model, n_head, n_token, N);  // [N, n_token, n_head, d_model]
+            k                     = ggml_cont(ctx, ggml_permute(ctx, k, 0, 2, 1, 3));      // [N, n_head, n_token, d_model]
+            k                     = ggml_reshape_3d(ctx, k, d_model, n_token, n_head);     // [N * n_head, n_token, d_model]
 
             struct ggml_tensor* v = ggml_add(ctx, ggml_repeat(ctx, v_b, x), ggml_mul_mat(ctx, v_w, x));
-            v = ggml_reshape_4d(ctx, v, d_model, n_head, n_token, N);   // [N, n_token, n_head, d_model]
-            v = ggml_cont(ctx, ggml_permute(ctx, v, 1, 2, 0, 3));       // [N, n_head, d_model, n_token]
-            v = ggml_reshape_3d(ctx, v, n_token, d_model, n_head * N);  // [N * n_head, d_model, n_token]
+            v                     = ggml_reshape_4d(ctx, v, d_model, n_head, n_token, N);   // [N, n_token, n_head, d_model]
+            v                     = ggml_cont(ctx, ggml_permute(ctx, v, 1, 2, 0, 3));       // [N, n_head, d_model, n_token]
+            v                     = ggml_reshape_3d(ctx, v, n_token, d_model, n_head * N);  // [N * n_head, d_model, n_token]
 
             struct ggml_tensor* kq = ggml_mul_mat(ctx, k, q);  // [N * n_head, n_token, n_token]
@@ -662,8 +662,8 @@ struct ResidualAttentionBlock {
             kq = ggml_soft_max_inplace(ctx, kq);
 
             struct ggml_tensor* kqv = ggml_mul_mat(ctx, v, kq);  // [N * n_head, n_token, d_model]
-            kqv = ggml_reshape_4d(ctx, kqv, d_model, n_token, n_head, N);
-            kqv = ggml_cont(ctx, ggml_permute(ctx, kqv, 0, 2, 1, 3));  // [N, n_token, n_head, d_model]
+            kqv                     = ggml_reshape_4d(ctx, kqv, d_model, n_token, n_head, N);
+            kqv                     = ggml_cont(ctx, ggml_permute(ctx, kqv, 0, 2, 1, 3));  // [N, n_token, n_head, d_model]
 
             x = ggml_reshape_2d(ctx, kqv, d_model * n_head, n_token * N);  // // [N * n_token, d_model * n_head]
         }
@@ -708,12 +708,12 @@ struct ResidualAttentionBlock {
 struct CLIPTextModel {
     ModelType model_type = SD1;
     // network hparams
-    int32_t vocab_size = 49408;
+    int32_t vocab_size              = 49408;
     int32_t max_position_embeddings = 77;
-    int32_t hidden_size = 768;  // 1024 for SD 2.x
-    int32_t intermediate_size = 3072;  // 4096 for SD 2.x
-    int32_t n_head = 12;  // num_attention_heads, 16 for SD 2.x
-    int32_t num_hidden_layers = 12;  // 24 for SD 2.x
+    int32_t hidden_size             = 768;   // 1024 for SD 2.x
+    int32_t intermediate_size       = 3072;  // 4096 for SD 2.x
+    int32_t n_head                  = 12;    // num_attention_heads, 16 for SD 2.x
+    int32_t num_hidden_layers       = 12;    // 24 for SD 2.x
 
     // embeddings
     struct ggml_tensor* position_ids;
@@ -727,9 +727,9 @@ struct CLIPTextModel {
     CLIPTextModel(ModelType model_type = SD1)
         : model_type(model_type) {
         if (model_type == SD2) {
-            hidden_size = 1024;
+            hidden_size       = 1024;
             intermediate_size = 4096;
-            n_head = 16;
+            n_head            = 16;
             num_hidden_layers = 24;
         }
         resblocks.resize(num_hidden_layers);
@@ -739,9 +739,9 @@ struct CLIPTextModel {
     void set_resblocks_hp_params() {
         int d_model = hidden_size / n_head;  // 64
         for (int i = 0; i < num_hidden_layers; i++) {
-            resblocks[i].d_model = d_model;
-            resblocks[i].n_head = n_head;
-            resblocks[i].hidden_size = hidden_size;
+            resblocks[i].d_model           = d_model;
+            resblocks[i].n_head            = n_head;
+            resblocks[i].hidden_size       = hidden_size;
             resblocks[i].intermediate_size = intermediate_size;
         }
     }
@@ -764,7 +764,7 @@ struct CLIPTextModel {
         for (int i = 0; i < max_position_embeddings; i++) {
             ggml_set_i32_1d(position_ids, i, i);
         }
-        token_embed_weight = ggml_new_tensor_2d(ctx, wtype, hidden_size, vocab_size);
+        token_embed_weight    = ggml_new_tensor_2d(ctx, wtype, hidden_size, vocab_size);
         position_embed_weight = ggml_new_tensor_2d(ctx, wtype, hidden_size, max_position_embeddings);
 
         for (int i = 0; i < num_hidden_layers; i++) {
@@ -776,10 +776,10 @@ struct CLIPTextModel {
     }
 
     void map_by_name(std::map<std::string, struct ggml_tensor*>& tensors, const std::string prefix) {
-        tensors[prefix + "embeddings.token_embedding.weight"] = token_embed_weight;
+        tensors[prefix + "embeddings.token_embedding.weight"]    = token_embed_weight;
         tensors[prefix + "embeddings.position_embedding.weight"] = position_embed_weight;
-        tensors[prefix + "final_layer_norm.weight"] = final_ln_w;
-        tensors[prefix + "final_layer_norm.bias"] = final_ln_b;
+        tensors[prefix + "final_layer_norm.weight"]              = final_ln_w;
+        tensors[prefix + "final_layer_norm.bias"]                = final_ln_b;
         for (int i = 0; i < num_hidden_layers; i++) {
             resblocks[i].map_by_name(tensors, prefix + "encoder.layers." + std::to_string(i) + ".");
         }
@@ -822,7 +822,7 @@ struct FrozenCLIPEmbedder {
     CLIPTokenizer tokenizer;
     CLIPTextModel text_model;
     struct ggml_tensor* forward(struct ggml_context* ctx, const std::string& prompt) {
-        std::vector<int> tokens = tokenizer.tokenize(prompt, text_model.max_position_embeddings, true);
+        std::vector<int> tokens       = tokenizer.tokenize(prompt, text_model.max_position_embeddings, true);
         struct ggml_tensor* input_ids = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, tokens.size());
         memcpy(input_ids->data, tokens.data(), tokens.size() * ggml_element_size(input_ids));
         struct ggml_tensor* hidden_states = text_model.forward(ctx, input_ids);
@@ -841,7 +841,7 @@ struct FrozenCLIPEmbedderWithCustomWords {
 
     std::pair<std::vector<int>, std::vector<float>> tokenize(std::string text,
                                                              size_t max_length = 0,
-                                                             bool padding = false) {
+                                                             bool padding      = false) {
         auto parsed_attention = parse_prompt_attention(text);
 
         {
@@ -858,7 +858,7 @@ struct FrozenCLIPEmbedderWithCustomWords {
         std::vector<float> weights;
         for (const auto& item : parsed_attention) {
             const std::string& curr_text = item.first;
-            float curr_weight = item.second;
+            float curr_weight            = item.second;
             std::vector<int> curr_tokens = tokenizer.encode(curr_text);
             tokens.insert(tokens.end(), curr_tokens.begin(), curr_tokens.end());
             weights.insert(weights.end(), curr_tokens.size(), curr_weight);
@@ -969,21 +969,21 @@ struct ResBlock {
 
     void map_by_name(std::map<std::string, struct ggml_tensor*>& tensors, const std::string prefix) {
         tensors[prefix + "in_layers.0.weight"] = in_layer_0_w;
-        tensors[prefix + "in_layers.0.bias"] = in_layer_0_b;
+        tensors[prefix + "in_layers.0.bias"]   = in_layer_0_b;
         tensors[prefix + "in_layers.2.weight"] = in_layer_2_w;
-        tensors[prefix + "in_layers.2.bias"] = in_layer_2_b;
+        tensors[prefix + "in_layers.2.bias"]   = in_layer_2_b;
 
         tensors[prefix + "emb_layers.1.weight"] = emb_layer_1_w;
-        tensors[prefix + "emb_layers.1.bias"] = emb_layer_1_b;
+        tensors[prefix + "emb_layers.1.bias"]   = emb_layer_1_b;
 
         tensors[prefix + "out_layers.0.weight"] = out_layer_0_w;
-        tensors[prefix + "out_layers.0.bias"] = out_layer_0_b;
+        tensors[prefix + "out_layers.0.bias"]   = out_layer_0_b;
         tensors[prefix + "out_layers.3.weight"] = out_layer_3_w;
-        tensors[prefix + "out_layers.3.bias"] = out_layer_3_b;
+        tensors[prefix + "out_layers.3.bias"]   = out_layer_3_b;
 
         if (out_channels != channels) {
             tensors[prefix + "skip_connection.weight"] = skip_w;
-            tensors[prefix + "skip_connection.bias"] = skip_b;
+            tensors[prefix + "skip_connection.bias"]   = skip_b;
         }
     }
"skip_connection.weight"] = skip_w; - tensors[prefix + "skip_connection.bias"] = skip_b; + tensors[prefix + "skip_connection.bias"] = skip_b; } } @@ -994,15 +994,15 @@ struct ResBlock { // in_layers // group norm 32 auto h = ggml_group_norm_32(ctx, x); - h = ggml_add(ctx, - ggml_mul(ctx, - ggml_repeat(ctx, - ggml_reshape_4d(ctx, in_layer_0_w, 1, 1, in_layer_0_w->ne[0], 1), - h), - h), - ggml_repeat(ctx, - ggml_reshape_4d(ctx, in_layer_0_b, 1, 1, in_layer_0_b->ne[0], 1), - h)); + h = ggml_add(ctx, + ggml_mul(ctx, + ggml_repeat(ctx, + ggml_reshape_4d(ctx, in_layer_0_w, 1, 1, in_layer_0_w->ne[0], 1), + h), + h), + ggml_repeat(ctx, + ggml_reshape_4d(ctx, in_layer_0_b, 1, 1, in_layer_0_b->ne[0], 1), + h)); // silu h = ggml_silu_inplace(ctx, h); // conv2d @@ -1015,10 +1015,10 @@ struct ResBlock { // emb_layers auto emb_out = ggml_silu(ctx, emb); - emb_out = ggml_mul_mat(ctx, emb_layer_1_w, emb_out); - emb_out = ggml_add(ctx, ggml_repeat(ctx, emb_layer_1_b, emb_out), emb_out); // [N, out_channels] - emb_out = ggml_reshape_4d(ctx, emb_out, 1, 1, emb_out->ne[0], emb_out->ne[1]); // [N, out_channels, 1, 1] - emb_out = ggml_repeat(ctx, emb_out, h); // [N, out_channels, h, w] + emb_out = ggml_mul_mat(ctx, emb_layer_1_w, emb_out); + emb_out = ggml_add(ctx, ggml_repeat(ctx, emb_layer_1_b, emb_out), emb_out); // [N, out_channels] + emb_out = ggml_reshape_4d(ctx, emb_out, 1, 1, emb_out->ne[0], emb_out->ne[1]); // [N, out_channels, 1, 1] + emb_out = ggml_repeat(ctx, emb_out, h); // [N, out_channels, h, w] // out_layers h = ggml_add(ctx, h, emb_out); @@ -1056,7 +1056,7 @@ struct SpatialTransformer { int in_channels; // mult * model_channels int n_head; // num_heads int d_head; // in_channels // n_heads - int depth = 1; // 1 + int depth = 1; // 1 int context_dim = 768; // hidden_size, 1024 for SD2.x // group norm @@ -1131,8 +1131,8 @@ struct SpatialTransformer { } void init_params(struct ggml_context* ctx, ggml_type wtype) { - norm_w = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, in_channels); - norm_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, in_channels); + norm_w = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, in_channels); + norm_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, in_channels); proj_in_w = ggml_new_tensor_4d(ctx, GGML_TYPE_F16, 1, 1, in_channels, in_channels); proj_in_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, in_channels); @@ -1171,43 +1171,43 @@ struct SpatialTransformer { } void map_by_name(std::map& tensors, const std::string prefix) { - tensors[prefix + "norm.weight"] = norm_w; - tensors[prefix + "norm.bias"] = norm_b; + tensors[prefix + "norm.weight"] = norm_w; + tensors[prefix + "norm.bias"] = norm_b; tensors[prefix + "proj_in.weight"] = proj_in_w; - tensors[prefix + "proj_in.bias"] = proj_in_b; + tensors[prefix + "proj_in.bias"] = proj_in_b; // transformer { - std::string transformer_prefix = prefix + "transformer_blocks.0."; + std::string transformer_prefix = prefix + "transformer_blocks.0."; tensors[transformer_prefix + "attn1.to_q.weight"] = transformer.attn1_q_w; tensors[transformer_prefix + "attn1.to_k.weight"] = transformer.attn1_k_w; tensors[transformer_prefix + "attn1.to_v.weight"] = transformer.attn1_v_w; tensors[transformer_prefix + "attn1.to_out.0.weight"] = transformer.attn1_out_w; - tensors[transformer_prefix + "attn1.to_out.0.bias"] = transformer.attn1_out_b; + tensors[transformer_prefix + "attn1.to_out.0.bias"] = transformer.attn1_out_b; tensors[transformer_prefix + "ff.net.0.proj.weight"] = transformer.ff_0_proj_w; - tensors[transformer_prefix + "ff.net.0.proj.bias"] = 
@@ -1229,12 +1229,12 @@ struct SpatialTransformer {
                           x));  // [N, in_channels, h, w]
 
         // transformer
-        const int64_t n = x->ne[3];
-        const int64_t c = x->ne[2];
-        const int64_t h = x->ne[1];
-        const int64_t w = x->ne[0];
+        const int64_t n            = x->ne[3];
+        const int64_t c            = x->ne[2];
+        const int64_t h            = x->ne[1];
+        const int64_t w            = x->ne[0];
         const int64_t max_position = context->ne[1];
-        x = ggml_cont(ctx, ggml_permute(ctx, x, 1, 2, 0, 3));  // [N, h, w, in_channels]
+        x                          = ggml_cont(ctx, ggml_permute(ctx, x, 1, 2, 0, 3));  // [N, h, w, in_channels]
 
         {
             auto r = x;
@@ -1251,30 +1251,30 @@ struct SpatialTransformer {
 
             // self-attention
             {
-                x = ggml_reshape_2d(ctx, x, c, h * w * n);  // [N * h * w, in_channels]
+                x                     = ggml_reshape_2d(ctx, x, c, h * w * n);         // [N * h * w, in_channels]
                 struct ggml_tensor* q = ggml_mul_mat(ctx, transformer.attn1_q_w, x);   // [N * h * w, in_channels]
-                q = ggml_scale_inplace(ctx, q, ggml_new_f32(ctx, 1.0f / sqrt((float)d_head)));
-                q = ggml_reshape_4d(ctx, q, d_head, n_head, h * w, n);   // [N, h * w, n_head, d_head]
-                q = ggml_cont(ctx, ggml_permute(ctx, q, 0, 2, 1, 3));    // [N, n_head, h * w, d_head]
-                q = ggml_reshape_3d(ctx, q, d_head, h * w, n_head * n);  // [N * n_head, h * w, d_head]
+                q                     = ggml_scale_inplace(ctx, q, ggml_new_f32(ctx, 1.0f / sqrt((float)d_head)));
+                q                     = ggml_reshape_4d(ctx, q, d_head, n_head, h * w, n);   // [N, h * w, n_head, d_head]
+                q                     = ggml_cont(ctx, ggml_permute(ctx, q, 0, 2, 1, 3));    // [N, n_head, h * w, d_head]
+                q                     = ggml_reshape_3d(ctx, q, d_head, h * w, n_head * n);  // [N * n_head, h * w, d_head]
 
-                struct ggml_tensor* k = ggml_mul_mat(ctx, transformer.attn1_k_w, x);  // [N * h * w, in_channels]
-                k = ggml_reshape_4d(ctx, k, d_head, n_head, h * w, n);   // [N, h * w, n_head, d_head]
-                k = ggml_cont(ctx, ggml_permute(ctx, k, 0, 2, 1, 3));    // [N, n_head, h * w, d_head]
-                k = ggml_reshape_3d(ctx, k, d_head, h * w, n_head * n);  // [N * n_head, h * w, d_head]
+                struct ggml_tensor* k = ggml_mul_mat(ctx, transformer.attn1_k_w, x);         // [N * h * w, in_channels]
+                k                     = ggml_reshape_4d(ctx, k, d_head, n_head, h * w, n);   // [N, h * w, n_head, d_head]
+                k                     = ggml_cont(ctx, ggml_permute(ctx, k, 0, 2, 1, 3));    // [N, n_head, h * w, d_head]
+                k                     = ggml_reshape_3d(ctx, k, d_head, h * w, n_head * n);  // [N * n_head, h * w, d_head]
 
-                struct ggml_tensor* v = ggml_mul_mat(ctx, transformer.attn1_v_w, x);  // [N * h * w, in_channels]
-                v = ggml_reshape_4d(ctx, v, d_head, n_head, h * w, n);   // [N, h * w, n_head, d_head]
-                v = ggml_cont(ctx, ggml_permute(ctx, v, 1, 2, 0, 3));    // [N, n_head, d_head, h * w]
-                v = ggml_reshape_3d(ctx, v, h * w, d_head, n_head * n);  // [N * n_head, d_head, h * w]
+                struct ggml_tensor* v = ggml_mul_mat(ctx, transformer.attn1_v_w, x);         // [N * h * w, in_channels]
+                v                     = ggml_reshape_4d(ctx, v, d_head, n_head, h * w, n);   // [N, h * w, n_head, d_head]
+                v                     = ggml_cont(ctx, ggml_permute(ctx, v, 1, 2, 0, 3));    // [N, n_head, d_head, h * w]
+                v                     = ggml_reshape_3d(ctx, v, h * w, d_head, n_head * n);  // [N * n_head, d_head, h * w]
 
                 struct ggml_tensor* kq = ggml_mul_mat(ctx, k, q);  // [N * n_head, h * w, h * w]
                 // kq = ggml_diag_mask_inf_inplace(ctx, kq, 0);
                 kq = ggml_soft_max_inplace(ctx, kq);
 
                 struct ggml_tensor* kqv = ggml_mul_mat(ctx, v, kq);  // [N * n_head, h * w, d_head]
-                kqv = ggml_reshape_4d(ctx, kqv, d_head, h * w, n_head, n);
-                kqv = ggml_cont(ctx, ggml_permute(ctx, kqv, 0, 2, 1, 3));  // [N, h * w, n_head, d_head]
+                kqv                     = ggml_reshape_4d(ctx, kqv, d_head, h * w, n_head, n);
+                kqv                     = ggml_cont(ctx, ggml_permute(ctx, kqv, 0, 2, 1, 3));  // [N, h * w, n_head, d_head]
 
                 // x = ggml_cpy(ctx, kqv, ggml_new_tensor_2d(ctx, GGML_TYPE_F32, d_head * n_head, h * w * n));
                 x = ggml_reshape_2d(ctx, kqv, d_head * n_head, h * w * n);
@@ -1298,24 +1298,24 @@ struct SpatialTransformer {
 
             // cross-attention
             {
-                x = ggml_reshape_2d(ctx, x, c, h * w * n);  // [N * h * w, in_channels]
-                context = ggml_reshape_2d(ctx, context, context->ne[0], context->ne[1] * context->ne[2]);  // [N * max_position, hidden_size]
-                struct ggml_tensor* q = ggml_mul_mat(ctx, transformer.attn2_q_w, x);  // [N * h * w, in_channels]
+                x                     = ggml_reshape_2d(ctx, x, c, h * w * n);                                           // [N * h * w, in_channels]
+                context               = ggml_reshape_2d(ctx, context, context->ne[0], context->ne[1] * context->ne[2]);  // [N * max_position, hidden_size]
+                struct ggml_tensor* q = ggml_mul_mat(ctx, transformer.attn2_q_w, x);                                     // [N * h * w, in_channels]
                 q = ggml_scale_inplace(ctx, q, ggml_new_f32(ctx, 1.0f / sqrt((float)d_head)));
                 q = ggml_reshape_4d(ctx, q, d_head, n_head, h * w, n);   // [N, h * w, n_head, d_head]
                 q = ggml_cont(ctx, ggml_permute(ctx, q, 0, 2, 1, 3));    // [N, n_head, h * w, d_head]
                 q = ggml_reshape_3d(ctx, q, d_head, h * w, n_head * n);  // [N * n_head, h * w, d_head]
 
-                struct ggml_tensor* k = ggml_mul_mat(ctx, transformer.attn2_k_w, context);  // [N * max_position, in_channels]
-                k = ggml_reshape_4d(ctx, k, d_head, n_head, max_position, n);  // [N, max_position, n_head, d_head]
-                k = ggml_cont(ctx, ggml_permute(ctx, k, 0, 2, 1, 3));  // [N, n_head, max_position, d_head]
-                k = ggml_reshape_3d(ctx, k, d_head, max_position, n_head * n);  // [N * n_head, max_position, d_head]
+                struct ggml_tensor* k = ggml_mul_mat(ctx, transformer.attn2_k_w, context);          // [N * max_position, in_channels]
+                k                     = ggml_reshape_4d(ctx, k, d_head, n_head, max_position, n);   // [N, max_position, n_head, d_head]
+                k                     = ggml_cont(ctx, ggml_permute(ctx, k, 0, 2, 1, 3));           // [N, n_head, max_position, d_head]
+                k                     = ggml_reshape_3d(ctx, k, d_head, max_position, n_head * n);  // [N * n_head, max_position, d_head]
 
-                struct ggml_tensor* v = ggml_mul_mat(ctx, transformer.attn2_v_w, context);  // [N * max_position, in_channels]
-                v = ggml_reshape_4d(ctx, v, d_head, n_head, max_position, n);  // [N, max_position, n_head, d_head]
-                v = ggml_cont(ctx, ggml_permute(ctx, v, 1, 2, 0, 3));  // [N, n_head, d_head, max_position]
-                v = ggml_reshape_3d(ctx, v, max_position, d_head, n_head * n);  // [N * n_head, d_head, max_position]
+                struct ggml_tensor* v = ggml_mul_mat(ctx, transformer.attn2_v_w, context);          // [N * max_position, in_channels]
+                v                     = ggml_reshape_4d(ctx, v, d_head, n_head, max_position, n);   // [N, max_position, n_head, d_head]
+                v                     = ggml_cont(ctx, ggml_permute(ctx, v, 1, 2, 0, 3));           // [N, n_head, d_head, max_position]
+                v                     = ggml_reshape_3d(ctx, v, max_position, d_head, n_head * n);  // [N * n_head, d_head, max_position]
 
                 struct ggml_tensor* kq = ggml_mul_mat(ctx, k, q);  // [N * n_head, h * w, max_position]
                 // kq = ggml_diag_mask_inf_inplace(ctx, kq, 0);
@@ -1350,16 +1350,16 @@ struct SpatialTransformer {
             // ff
             {
                 // GEGLU
-                auto x_w = ggml_view_2d(ctx,
-                                        transformer.ff_0_proj_w,
-                                        transformer.ff_0_proj_w->ne[0],
-                                        transformer.ff_0_proj_w->ne[1] / 2,
-                                        transformer.ff_0_proj_w->nb[1],
-                                        0);  // [in_channels * 4, in_channels]
-                auto x_b = ggml_view_1d(ctx,
-                                        transformer.ff_0_proj_b,
-                                        transformer.ff_0_proj_b->ne[0] / 2,
-                                        0);  // [in_channels * 4, in_channels]
+                auto x_w    = ggml_view_2d(ctx,
+                                           transformer.ff_0_proj_w,
+                                           transformer.ff_0_proj_w->ne[0],
+                                           transformer.ff_0_proj_w->ne[1] / 2,
+                                           transformer.ff_0_proj_w->nb[1],
+                                           0);  // [in_channels * 4, in_channels]
+                auto x_b    = ggml_view_1d(ctx,
+                                           transformer.ff_0_proj_b,
+                                           transformer.ff_0_proj_b->ne[0] / 2,
+                                           0);  // [in_channels * 4, in_channels]
                 auto gate_w = ggml_view_2d(ctx,
                                            transformer.ff_0_proj_w,
                                            transformer.ff_0_proj_w->ne[0],
@@ -1370,12 +1370,12 @@ struct SpatialTransformer {
                                            transformer.ff_0_proj_b,
                                            transformer.ff_0_proj_b->ne[0] / 2,
                                            transformer.ff_0_proj_b->nb[0] * transformer.ff_0_proj_b->ne[0] / 2);  // [in_channels * 4, ]
-                x = ggml_reshape_2d(ctx, x, c, w * h * n);
-                auto x_in = x;
-                x = ggml_mul_mat(ctx, x_w, x_in);  // [N * h * w, in_channels * 4]
-                x = ggml_add(ctx, ggml_repeat(ctx, x_b, x), x);
-                auto gate = ggml_mul_mat(ctx, gate_w, x_in);  // [N * h * w, in_channels * 4]
-                gate = ggml_add(ctx, ggml_repeat(ctx, gate_b, gate), gate);
+                x         = ggml_reshape_2d(ctx, x, c, w * h * n);
+                auto x_in = x;
+                x         = ggml_mul_mat(ctx, x_w, x_in);  // [N * h * w, in_channels * 4]
+                x         = ggml_add(ctx, ggml_repeat(ctx, x_b, x), x);
+                auto gate = ggml_mul_mat(ctx, gate_w, x_in);  // [N * h * w, in_channels * 4]
+                gate      = ggml_add(ctx, ggml_repeat(ctx, gate_b, gate), gate);
 
                 gate = ggml_gelu_inplace(ctx, gate);
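The two views above exist because the feed-forward is a GEGLU: `ff_0_proj` computes both halves in a single matmul, then one half gates the other through a GELU, GEGLU(x) = (xW + b) * GELU(xV + c). A scalar-level sketch of the idea, independent of ggml:

```cpp
// GEGLU at the level of a single feature pair: value * gelu(gate).
#include <cmath>

float gelu(float x) {
    // tanh approximation of GELU, the variant ggml commonly uses
    return 0.5f * x * (1.0f + std::tanh(0.7978845608f * (x + 0.044715f * x * x * x)));
}

float geglu(float value, float gate) {
    return value * gelu(gate);
}
```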
@@ -1431,10 +1431,10 @@ struct DownSample {
     void map_by_name(std::map<std::string, struct ggml_tensor*>& tensors, const std::string prefix) {
         if (vae_downsample) {
             tensors[prefix + "conv.weight"] = op_w;
-            tensors[prefix + "conv.bias"] = op_b;
+            tensors[prefix + "conv.bias"]   = op_b;
         } else {
             tensors[prefix + "op.weight"] = op_w;
-            tensors[prefix + "op.bias"] = op_b;
+            tensors[prefix + "op.bias"]   = op_b;
         }
     }
 
@@ -1513,7 +1513,7 @@ struct UpSample {
     void map_by_name(std::map<std::string, struct ggml_tensor*>& tensors, const std::string prefix) {
         tensors[prefix + "conv.weight"] = conv_w;
-        tensors[prefix + "conv.bias"] = conv_b;
+        tensors[prefix + "conv.bias"]   = conv_b;
     }
 
     struct ggml_tensor* forward(struct ggml_context* ctx, struct ggml_tensor* x) {
@@ -1533,16 +1533,16 @@ struct UpSample {
 // ldm.modules.diffusionmodules.openaimodel.UNetModel
 struct UNetModel {
     // network hparams
-    int in_channels = 4;
-    int model_channels = 320;
-    int out_channels = 4;
-    int num_res_blocks = 2;
+    int in_channels              = 4;
+    int model_channels           = 320;
+    int out_channels             = 4;
+    int num_res_blocks           = 2;
     int attention_resolutions[3] = {4, 2, 1};
-    int channel_mult[4] = {1, 2, 4, 4};
-    int time_embed_dim = 1280;  // model_channels*4
-    int num_heads = 8;
-    int num_head_channels = -1;  // channels // num_heads
-    int context_dim = 768;  // 1024 for SD2.x
+    int channel_mult[4]          = {1, 2, 4, 4};
+    int time_embed_dim           = 1280;  // model_channels*4
+    int num_heads                = 8;
+    int num_head_channels        = -1;    // channels // num_heads
+    int context_dim              = 768;   // 1024 for SD2.x
 
     // network params
     struct ggml_tensor* time_embed_0_w;  // [time_embed_dim, model_channels]
@@ -1579,9 +1579,9 @@ struct UNetModel {
     UNetModel(ModelType model_type = SD1) {
         if (model_type == SD2) {
-            context_dim = 1024;
+            context_dim       = 1024;
             num_head_channels = 64;
-            num_heads = -1;
+            num_heads         = -1;
         }
 
         // set up hparams of blocks
@@ -1595,7 +1595,7 @@ struct UNetModel {
         for (int i = 0; i < len_mults; i++) {
             int mult = channel_mult[i];
             for (int j = 0; j < num_res_blocks; j++) {
-                input_res_blocks[i][j].channels = ch;
+                input_res_blocks[i][j].channels     = ch;
                 input_res_blocks[i][j].emb_channels = time_embed_dim;
                 input_res_blocks[i][j].out_channels = mult * model_channels;
@@ -1609,14 +1609,14 @@ struct UNetModel {
                         n_head = ch / d_head;
                     }
                     input_transformers[i][j].in_channels = ch;
-                    input_transformers[i][j].n_head = n_head;
-                    input_transformers[i][j].d_head = d_head;
+                    input_transformers[i][j].n_head      = n_head;
+                    input_transformers[i][j].d_head      = d_head;
                     input_transformers[i][j].context_dim = context_dim;
                 }
                 input_block_chans.push_back(ch);
             }
             if (i != len_mults - 1) {
-                input_down_samples[i].channels = ch;
+                input_down_samples[i].channels     = ch;
                 input_down_samples[i].out_channels = ch;
                 input_block_chans.push_back(ch);
@@ -1625,7 +1625,7 @@ struct UNetModel {
         }
 
         // middle blocks
-        middle_block_0.channels = ch;
+        middle_block_0.channels     = ch;
         middle_block_0.emb_channels = time_embed_dim;
         middle_block_0.out_channels = ch;
@@ -1636,11 +1636,11 @@ struct UNetModel {
             n_head = ch / d_head;
         }
         middle_block_1.in_channels = ch;
-        middle_block_1.n_head = n_head;
-        middle_block_1.d_head = d_head;
+        middle_block_1.n_head      = n_head;
+        middle_block_1.d_head      = d_head;
         middle_block_1.context_dim = context_dim;
 
-        middle_block_2.channels = ch;
+        middle_block_2.channels     = ch;
         middle_block_2.emb_channels = time_embed_dim;
         middle_block_2.out_channels = ch;
@@ -1651,7 +1651,7 @@ struct UNetModel {
                 int ich = input_block_chans.back();
                 input_block_chans.pop_back();
 
-                output_res_blocks[i][j].channels = ch + ich;
+                output_res_blocks[i][j].channels     = ch + ich;
                 output_res_blocks[i][j].emb_channels = time_embed_dim;
                 output_res_blocks[i][j].out_channels = mult * model_channels;
@@ -1665,13 +1665,13 @@ struct UNetModel {
                         n_head = ch / d_head;
                     }
                     output_transformers[i][j].in_channels = ch;
-                    output_transformers[i][j].n_head = n_head;
-                    output_transformers[i][j].d_head = d_head;
+                    output_transformers[i][j].n_head      = n_head;
+                    output_transformers[i][j].d_head      = d_head;
                     output_transformers[i][j].context_dim = context_dim;
                 }
 
                 if (i > 0 && j == num_res_blocks) {
-                    output_up_samples[i - 1].channels = ch;
+                    output_up_samples[i - 1].channels     = ch;
                     output_up_samples[i - 1].out_channels = ch;
 
                     ds /= 2;
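With `model_channels = 320` and `channel_mult = {1, 2, 4, 4}`, the constructor loops above produce the standard SD 1.x topology: levels at 320/640/1280/1280 channels, `ds` doubling at each downsample, and spatial transformers only where `ds` is one of `attention_resolutions` ({4, 2, 1}). A standalone sketch of just that bookkeeping:

```cpp
// Standalone sketch of the channel bookkeeping in UNetModel's constructor.
#include <cstdio>

int main() {
    const int model_channels  = 320;
    const int channel_mult[4] = {1, 2, 4, 4};
    int ds = 1;
    for (int i = 0; i < 4; i++) {
        int ch = channel_mult[i] * model_channels;  // res block output channels
        bool has_attention = (ds == 1 || ds == 2 || ds == 4);  // attention_resolutions
        printf("level %d: ch = %4d, ds = %d, attention = %s\n",
               i, ch, ds, has_attention ? "yes" : "no");
        if (i != 3) {
            ds *= 2;  // DownSample between levels
        }
    }
}
// Prints 320/640/1280/1280 with attention at ds = 1, 2, 4 but not at 8.
```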
@@ -1693,7 +1693,7 @@ struct UNetModel {
         mem_size += 6 * ggml_tensor_overhead();  // object overhead
 
         // input_blocks
-        int ds = 1;
+        int ds        = 1;
         int len_mults = sizeof(channel_mult) / sizeof(int);
         for (int i = 0; i < len_mults; i++) {
             for (int j = 0; j < num_res_blocks; j++) {
@@ -1750,8 +1750,8 @@ struct UNetModel {
         // input_blocks
         input_block_0_w = ggml_new_tensor_4d(ctx, GGML_TYPE_F16, 3, 3, in_channels, model_channels);
         input_block_0_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, model_channels);
-        int ds = 1;
-        int len_mults = sizeof(channel_mult) / sizeof(int);
+        int ds          = 1;
+        int len_mults   = sizeof(channel_mult) / sizeof(int);
         for (int i = 0; i < len_mults; i++) {
             for (int j = 0; j < num_res_blocks; j++) {
                 input_res_blocks[i][j].init_params(ctx, wtype);
@@ -1797,18 +1797,18 @@ struct UNetModel {
     void map_by_name(std::map<std::string, struct ggml_tensor*>& tensors, const std::string prefix) {
         tensors[prefix + "time_embed.0.weight"] = time_embed_0_w;
-        tensors[prefix + "time_embed.0.bias"] = time_embed_0_b;
+        tensors[prefix + "time_embed.0.bias"]   = time_embed_0_b;
         tensors[prefix + "time_embed.2.weight"] = time_embed_2_w;
-        tensors[prefix + "time_embed.2.bias"] = time_embed_2_b;
+        tensors[prefix + "time_embed.2.bias"]   = time_embed_2_b;
 
         // input_blocks
         tensors[prefix + "input_blocks.0.0.weight"] = input_block_0_w;
-        tensors[prefix + "input_blocks.0.0.bias"] = input_block_0_b;
+        tensors[prefix + "input_blocks.0.0.bias"]   = input_block_0_b;
 
-        int len_mults = sizeof(channel_mult) / sizeof(int);
+        int len_mults       = sizeof(channel_mult) / sizeof(int);
         int input_block_idx = 0;
-        int ds = 1;
+        int ds              = 1;
         for (int i = 0; i < len_mults; i++) {
             for (int j = 0; j < num_res_blocks; j++) {
                 input_block_idx += 1;
@@ -1853,9 +1853,9 @@ struct UNetModel {
 
         // out
         tensors[prefix + "out.0.weight"] = out_0_w;
-        tensors[prefix + "out.0.bias"] = out_0_b;
+        tensors[prefix + "out.0.bias"]   = out_0_b;
         tensors[prefix + "out.2.weight"] = out_2_w;
-        tensors[prefix + "out.2.bias"] = out_2_b;
+        tensors[prefix + "out.2.bias"]   = out_2_b;
     }
 
     struct ggml_tensor* forward(struct ggml_context* ctx,
@@ -1873,24 +1873,24 @@ struct UNetModel {
 
         // time_embed
         auto emb = ggml_mul_mat(ctx, time_embed_0_w, t_emb);
-        emb = ggml_add(ctx, ggml_repeat(ctx, time_embed_0_b, emb), emb);
-        emb = ggml_silu_inplace(ctx, emb);
-        emb = ggml_mul_mat(ctx, time_embed_2_w, emb);
-        emb = ggml_add(ctx, ggml_repeat(ctx, time_embed_2_b, emb), emb);  // [N, time_embed_dim]
+        emb      = ggml_add(ctx, ggml_repeat(ctx, time_embed_0_b, emb), emb);
+        emb      = ggml_silu_inplace(ctx, emb);
+        emb      = ggml_mul_mat(ctx, time_embed_2_w, emb);
+        emb      = ggml_add(ctx, ggml_repeat(ctx, time_embed_2_b, emb), emb);  // [N, time_embed_dim]
 
         // input_blocks
         std::vector<struct ggml_tensor*> hs;
 
         // input block 0
         auto h = ggml_conv_2d(ctx, input_block_0_w, x, 1, 1, 1, 1, 1, 1);  // [N, model_channels, h, w]
-        h = ggml_add(ctx,
-                     h,
-                     ggml_repeat(ctx,
-                                 ggml_reshape_4d(ctx, input_block_0_b, 1, 1, input_block_0_b->ne[0], 1),
-                                 h));  // [N, model_channels, h, w]
+        h      = ggml_add(ctx,
+                          h,
+                          ggml_repeat(ctx,
+                                      ggml_reshape_4d(ctx, input_block_0_b, 1, 1, input_block_0_b->ne[0], 1),
+                                      h));  // [N, model_channels, h, w]
         hs.push_back(h);
 
         // input block 1-11
         int len_mults = sizeof(channel_mult) / sizeof(int);
-        int ds = 1;
+        int ds        = 1;
         for (int i = 0; i < len_mults; i++) {
             int mult = channel_mult[i];
             for (int j = 0; j < num_res_blocks; j++) {
@@ -2021,18 +2021,18 @@ struct ResnetBlock {
     void map_by_name(std::map<std::string, struct ggml_tensor*>& tensors, const std::string prefix) {
         tensors[prefix + "norm1.weight"] = norm1_w;
-        tensors[prefix + "norm1.bias"] = norm1_b;
+        tensors[prefix + "norm1.bias"]   = norm1_b;
         tensors[prefix + "conv1.weight"] = conv1_w;
-        tensors[prefix + "conv1.bias"] = conv1_b;
+        tensors[prefix + "conv1.bias"]   = conv1_b;
         tensors[prefix + "norm2.weight"] = norm2_w;
-        tensors[prefix + "norm2.bias"] = norm2_b;
+        tensors[prefix + "norm2.bias"]   = norm2_b;
         tensors[prefix + "conv2.weight"] = conv2_w;
-        tensors[prefix + "conv2.bias"] = conv2_b;
+        tensors[prefix + "conv2.bias"]   = conv2_b;
 
         if (out_channels != in_channels) {
             tensors[prefix + "nin_shortcut.weight"] = nin_shortcut_w;
-            tensors[prefix + "nin_shortcut.bias"] = nin_shortcut_b;
+            tensors[prefix + "nin_shortcut.bias"]   = nin_shortcut_b;
         }
     }
@@ -2041,16 +2041,16 @@ struct ResnetBlock {
         // group norm 32
         auto h = ggml_group_norm_32(ctx, z);
-        h = ggml_mul(ctx,
-                     ggml_repeat(ctx,
-                                 ggml_reshape_4d(ctx, norm1_w, 1, 1, norm1_w->ne[0], 1),
-                                 h),
-                     h);
-        h = ggml_add(ctx,
-                     h,
-                     ggml_repeat(ctx,
-                                 ggml_reshape_4d(ctx, norm1_b, 1, 1, norm1_b->ne[0], 1),
-                                 h));
+        h      = ggml_mul(ctx,
+                          ggml_repeat(ctx,
+                                      ggml_reshape_4d(ctx, norm1_w, 1, 1, norm1_w->ne[0], 1),
+                                      h),
+                          h);
+        h      = ggml_add(ctx,
+                          h,
+                          ggml_repeat(ctx,
+                                      ggml_reshape_4d(ctx, norm1_b, 1, 1, norm1_b->ne[0], 1),
+                                      h));
         // silu
         h = ggml_silu_inplace(ctx, h);
         // conv2d
@@ -2121,28 +2121,28 @@ struct AttnBlock {
     void init_params(struct ggml_context* ctx, ggml_type wtype) {
         norm_w = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, in_channels);
         norm_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, in_channels);
-        q_w = ggml_new_tensor_4d(ctx, GGML_TYPE_F16, 1, 1, in_channels, in_channels);
-        q_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, in_channels);
-        k_w = ggml_new_tensor_4d(ctx, GGML_TYPE_F16, 1, 1, in_channels, in_channels);
-        k_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, in_channels);
-        v_w = ggml_new_tensor_4d(ctx, GGML_TYPE_F16, 1, 1, in_channels, in_channels);
-        v_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, in_channels);
+        q_w    = ggml_new_tensor_4d(ctx, GGML_TYPE_F16, 1, 1, in_channels, in_channels);
+        q_b    = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, in_channels);
+        k_w    = ggml_new_tensor_4d(ctx, GGML_TYPE_F16, 1, 1, in_channels, in_channels);
+        k_b    = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, in_channels);
+        v_w    = ggml_new_tensor_4d(ctx, GGML_TYPE_F16, 1, 1, in_channels, in_channels);
+        v_b    = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, in_channels);
 
         proj_out_w = ggml_new_tensor_4d(ctx, GGML_TYPE_F16, 1, 1, in_channels, in_channels);
         proj_out_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, in_channels);
     }
 
     void map_by_name(std::map<std::string, struct ggml_tensor*>& tensors, const std::string prefix) {
-        tensors[prefix + "norm.weight"] = norm_w;
-        tensors[prefix + "norm.bias"] = norm_b;
-        tensors[prefix + "q.weight"] = q_w;
-        tensors[prefix + "q.bias"] = q_b;
-        tensors[prefix + "k.weight"] = k_w;
-        tensors[prefix + "k.bias"] = k_b;
-        tensors[prefix + "v.weight"] = v_w;
-        tensors[prefix + "v.bias"] = v_b;
+        tensors[prefix + "norm.weight"]     = norm_w;
+        tensors[prefix + "norm.bias"]       = norm_b;
+        tensors[prefix + "q.weight"]        = q_w;
+        tensors[prefix + "q.bias"]          = q_b;
+        tensors[prefix + "k.weight"]        = k_w;
+        tensors[prefix + "k.bias"]          = k_b;
+        tensors[prefix + "v.weight"]        = v_w;
+        tensors[prefix + "v.bias"]          = v_b;
         tensors[prefix + "proj_out.weight"] = proj_out_w;
-        tensors[prefix + "proj_out.bias"] = proj_out_b;
+        tensors[prefix + "proj_out.bias"]   = proj_out_b;
     }
 
     struct ggml_tensor* forward(struct ggml_context* ctx, struct ggml_tensor* x) {
@@ -2150,9 +2150,9 @@ struct AttnBlock {
         // group norm 32
         auto h_ = ggml_group_norm_32(ctx, x);
-        h_ = ggml_add(ctx,
-                      ggml_mul(ctx, ggml_repeat(ctx, ggml_reshape_4d(ctx, norm_w, 1, 1, norm_w->ne[0], 1), h_), h_),
-                      ggml_repeat(ctx, ggml_reshape_4d(ctx, norm_b, 1, 1, norm_b->ne[0], 1), h_));
+        h_      = ggml_add(ctx,
+                           ggml_mul(ctx, ggml_repeat(ctx, ggml_reshape_4d(ctx, norm_w, 1, 1, norm_w->ne[0], 1), h_), h_),
+                           ggml_repeat(ctx, ggml_reshape_4d(ctx, norm_b, 1, 1, norm_b->ne[0], 1), h_));
 
         const int64_t n = h_->ne[3];
         const int64_t c = h_->ne[2];
@@ -2160,27 +2160,27 @@ struct AttnBlock {
         const int64_t w = h_->ne[0];
         // q
         auto q = ggml_conv_2d(ctx, q_w, h_, 1, 1, 0, 0, 1, 1);
-        q = ggml_add(ctx,
-                     q,
-                     ggml_repeat(ctx,
-                                 ggml_reshape_4d(ctx, q_b, 1, 1, q_b->ne[0], 1),
-                                 q));  // [N, in_channels, h, w]
+        q      = ggml_add(ctx,
+                          q,
+                          ggml_repeat(ctx,
+                                      ggml_reshape_4d(ctx, q_b, 1, 1, q_b->ne[0], 1),
+                                      q));  // [N, in_channels, h, w]
         // k
         auto k = ggml_conv_2d(ctx, k_w, h_, 1, 1, 0, 0, 1, 1);
-        k = ggml_add(ctx,
-                     k,
-                     ggml_repeat(ctx,
-                                 ggml_reshape_4d(ctx, k_b, 1, 1, k_b->ne[0], 1),
-                                 k));  // [N, in_channels, h, w]
+        k      = ggml_add(ctx,
+                          k,
+                          ggml_repeat(ctx,
+                                      ggml_reshape_4d(ctx, k_b, 1, 1, k_b->ne[0], 1),
+                                      k));  // [N, in_channels, h, w]
         // v
         auto v = ggml_conv_2d(ctx, v_w, h_, 1, 1, 0, 0, 1, 1);
-        v = ggml_add(ctx,
-                     v,
-                     ggml_repeat(ctx,
-                                 ggml_reshape_4d(ctx, v_b, 1, 1, v_b->ne[0], 1),
-                                 v));  // [N, in_channels, h, w]
+        v      = ggml_add(ctx,
+                          v,
+                          ggml_repeat(ctx,
+                                      ggml_reshape_4d(ctx, v_b, 1, 1, v_b->ne[0], 1),
+                                      v));  // [N, in_channels, h, w]
 
         q = ggml_cont(ctx, ggml_permute(ctx, q, 1, 2, 0, 3));  // [N, h, w, in_channels]
         q = ggml_reshape_3d(ctx, q, c, h * w, n);              // [N, h * w, in_channels]
@@ -2189,10 +2189,10 @@ struct AttnBlock {
         k = ggml_reshape_3d(ctx, k, c, h * w, n);              // [N, h * w, in_channels]
 
         auto w_ = ggml_mul_mat(ctx, k, q);  // [N, h * w, h * w]
-        w_ = ggml_scale_inplace(ctx, w_, ggml_new_f32(ctx, 1.0f / sqrt((float)c)));
-        w_ = ggml_soft_max_inplace(ctx, w_);
+        w_      = ggml_scale_inplace(ctx, w_, ggml_new_f32(ctx, 1.0f / sqrt((float)c)));
+        w_      = ggml_soft_max_inplace(ctx, w_);
 
-        v = ggml_reshape_3d(ctx, v, h * w, c, n);  // [N, in_channels, h * w]
+        v  = ggml_reshape_3d(ctx, v, h * w, c, n);                // [N, in_channels, h * w]
         h_ = ggml_mul_mat(ctx, v, w_);                            // [N, h * w, in_channels]
         h_ = ggml_cont(ctx, ggml_permute(ctx, h_, 1, 0, 2, 3));   // [N, in_channels, h * w]
         h_ = ggml_reshape_4d(ctx, h_, w, h, c, n);                // [N, in_channels, h, w]
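`AttnBlock` is the VAE's single-head self-attention: the h * w spatial positions become a sequence and are attended with softmax(K^T Q / sqrt(c)). A naive, dependency-free sketch of the same computation (the real code stays in ggml tensors):

```cpp
// Naive single-head self-attention over T positions with C channels.
// q, k, v: T x C row-major; out: T x C. Computes softmax(q k^T / sqrt(C)) v.
#include <cmath>
#include <vector>

void attention(const std::vector<float>& q, const std::vector<float>& k,
               const std::vector<float>& v, std::vector<float>& out,
               int T, int C) {
    out.assign((size_t)T * C, 0.0f);
    std::vector<float> w(T);
    for (int i = 0; i < T; i++) {
        float maxw = -1e30f, sum = 0.0f;
        for (int j = 0; j < T; j++) {
            float dot = 0.0f;
            for (int c = 0; c < C; c++) dot += q[i * C + c] * k[j * C + c];
            w[j] = dot / std::sqrt((float)C);
            if (w[j] > maxw) maxw = w[j];
        }
        for (int j = 0; j < T; j++) {  // numerically stable softmax row
            w[j] = std::exp(w[j] - maxw);
            sum += w[j];
        }
        for (int j = 0; j < T; j++) {
            for (int c = 0; c < C; c++) out[i * C + c] += (w[j] / sum) * v[j * C + c];
        }
    }
}
```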
@@ -2211,12 +2211,12 @@ struct AttnBlock {
 
 // ldm.modules.diffusionmodules.model.Encoder
 struct Encoder {
-    int embed_dim = 4;
-    int ch = 128;
-    int z_channels = 4;
-    int in_channels = 3;
+    int embed_dim      = 4;
+    int ch             = 128;
+    int z_channels     = 4;
+    int in_channels    = 3;
     int num_res_blocks = 2;
-    int ch_mult[4] = {1, 2, 4, 4};
+    int ch_mult[4]     = {1, 2, 4, 4};
 
     struct ggml_tensor* conv_in_w;  // [ch, in_channels, 3, 3]
     struct ggml_tensor* conv_in_b;  // [ch, ]
@@ -2250,28 +2250,28 @@ struct Encoder {
             }
             int block_out = ch * ch_mult[i];
             for (int j = 0; j < num_res_blocks; j++) {
-                down_blocks[i][j].in_channels = block_in;
+                down_blocks[i][j].in_channels  = block_in;
                 down_blocks[i][j].out_channels = block_out;
-                block_in = block_out;
+                block_in                       = block_out;
             }
             if (i != len_mults - 1) {
-                down_samples[i].channels = block_in;
-                down_samples[i].out_channels = block_in;
+                down_samples[i].channels       = block_in;
+                down_samples[i].out_channels   = block_in;
                 down_samples[i].vae_downsample = true;
             }
         }
 
-        mid.block_1.in_channels = block_in;
+        mid.block_1.in_channels  = block_in;
         mid.block_1.out_channels = block_in;
-        mid.attn_1.in_channels = block_in;
-        mid.block_2.in_channels = block_in;
+        mid.attn_1.in_channels   = block_in;
+        mid.block_2.in_channels  = block_in;
         mid.block_2.out_channels = block_in;
     }
 
     size_t compute_params_mem_size(ggml_type wtype) {
         double mem_size = 0;
-        int len_mults = sizeof(ch_mult) / sizeof(int);
-        int block_in = ch * ch_mult[len_mults - 1];
+        int len_mults   = sizeof(ch_mult) / sizeof(int);
+        int block_in    = ch * ch_mult[len_mults - 1];
 
         mem_size += ch * in_channels * 3 * 3 * ggml_type_sizef(GGML_TYPE_F16);  // conv_in_w
         mem_size += ch * ggml_type_sizef(GGML_TYPE_F32);                        // conv_in_b
@@ -2301,7 +2301,7 @@ struct Encoder {
     void init_params(struct ggml_context* ctx, ggml_type wtype) {
         int len_mults = sizeof(ch_mult) / sizeof(int);
-        int block_in = ch * ch_mult[len_mults - 1];
+        int block_in  = ch * ch_mult[len_mults - 1];
 
         conv_in_w = ggml_new_tensor_4d(ctx, GGML_TYPE_F16, 3, 3, in_channels, ch);
         conv_in_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, ch);
@@ -2328,11 +2328,11 @@ struct Encoder {
     void map_by_name(std::map<std::string, struct ggml_tensor*>& tensors, const std::string prefix) {
         tensors[prefix + "norm_out.weight"] = norm_out_w;
-        tensors[prefix + "norm_out.bias"] = norm_out_b;
-        tensors[prefix + "conv_in.weight"] = conv_in_w;
-        tensors[prefix + "conv_in.bias"] = conv_in_b;
+        tensors[prefix + "norm_out.bias"]   = norm_out_b;
+        tensors[prefix + "conv_in.weight"]  = conv_in_w;
+        tensors[prefix + "conv_in.bias"]    = conv_in_b;
         tensors[prefix + "conv_out.weight"] = conv_out_w;
-        tensors[prefix + "conv_out.bias"] = conv_out_b;
+        tensors[prefix + "conv_out.bias"]   = conv_out_b;
 
         mid.block_1.map_by_name(tensors, prefix + "mid.block_1.");
         mid.attn_1.map_by_name(tensors, prefix + "mid.attn_1.");
@@ -2353,12 +2353,12 @@ struct Encoder {
         // x: [N, in_channels, h, w]
 
         // conv_in
-        auto h = ggml_conv_2d(ctx, conv_in_w, x, 1, 1, 1, 1, 1, 1);
-        h = ggml_add(ctx,
-                     h,
-                     ggml_repeat(ctx,
-                                 ggml_reshape_4d(ctx, conv_in_b, 1, 1, conv_in_b->ne[0], 1),
-                                 h));  // [N, ch, h, w]
+        auto h = ggml_conv_2d(ctx, conv_in_w, x, 1, 1, 1, 1, 1, 1);
+        h      = ggml_add(ctx,
+                          h,
+                          ggml_repeat(ctx,
+                                      ggml_reshape_4d(ctx, conv_in_b, 1, 1, conv_in_b->ne[0], 1),
+                                      h));  // [N, ch, h, w]
         int len_mults = sizeof(ch_mult) / sizeof(int);
         for (int i = 0; i < len_mults; i++) {
             for (int j = 0; j < num_res_blocks; j++) {
@@ -2397,12 +2397,12 @@ struct Encoder {
 
 // ldm.modules.diffusionmodules.model.Decoder
 struct Decoder {
-    int embed_dim = 4;
-    int ch = 128;
-    int z_channels = 4;
-    int out_ch = 3;
+    int embed_dim      = 4;
+    int ch             = 128;
+    int z_channels     = 4;
+    int out_ch         = 3;
     int num_res_blocks = 2;
-    int ch_mult[4] = {1, 2, 4, 4};
+    int ch_mult[4]     = {1, 2, 4, 4};
 
     // block_in = ch * ch_mult[-1], 512
     struct ggml_tensor* conv_in_w;  // [block_in, z_channels, 3, 3]
@@ -2426,24 +2426,24 @@ struct Decoder {
     Decoder() {
         int len_mults = sizeof(ch_mult) / sizeof(int);
-        int block_in = ch * ch_mult[len_mults - 1];
+        int block_in  = ch * ch_mult[len_mults - 1];
 
-        mid.block_1.in_channels = block_in;
+        mid.block_1.in_channels  = block_in;
         mid.block_1.out_channels = block_in;
-        mid.attn_1.in_channels = block_in;
-        mid.block_2.in_channels = block_in;
+        mid.attn_1.in_channels   = block_in;
+        mid.block_2.in_channels  = block_in;
         mid.block_2.out_channels = block_in;
 
         for (int i = len_mults - 1; i >= 0; i--) {
-            int mult = ch_mult[i];
+            int mult      = ch_mult[i];
             int block_out = ch * mult;
             for (int j = 0; j < num_res_blocks + 1; j++) {
-                up_blocks[i][j].in_channels = block_in;
+                up_blocks[i][j].in_channels  = block_in;
                 up_blocks[i][j].out_channels = block_out;
-                block_in = block_out;
+                block_in                     = block_out;
             }
             if (i != 0) {
-                up_samples[i - 1].channels = block_in;
+                up_samples[i - 1].channels     = block_in;
                 up_samples[i - 1].out_channels = block_in;
             }
         }
     }
 
     size_t compute_params_mem_size(ggml_type wtype) {
         double mem_size = 0;
@@ -2451,8 +2451,8 @@ struct Decoder {
-        int len_mults = sizeof(ch_mult) / sizeof(int);
-        int block_in = ch * ch_mult[len_mults - 1];
+        int len_mults   = sizeof(ch_mult) / sizeof(int);
+        int block_in    = ch * ch_mult[len_mults - 1];
 
         mem_size += block_in * z_channels * 3 * 3 * ggml_type_sizef(GGML_TYPE_F16);  // conv_in_w
         mem_size += block_in * ggml_type_sizef(GGML_TYPE_F32);                       // conv_in_b
@@ -2482,7 +2482,7 @@ struct Decoder {
     void init_params(struct ggml_context* ctx, ggml_type wtype) {
         int len_mults = sizeof(ch_mult) / sizeof(int);
-        int block_in = ch * ch_mult[len_mults - 1];
+        int block_in  = ch * ch_mult[len_mults - 1];
 
         norm_out_w = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, ch * ch_mult[0]);
         norm_out_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, ch * ch_mult[0]);
@@ -2509,11 +2509,11 @@ struct Decoder {
     void map_by_name(std::map<std::string, struct ggml_tensor*>& tensors, const std::string prefix) {
         tensors[prefix + "norm_out.weight"] = norm_out_w;
-        tensors[prefix + "norm_out.bias"] = norm_out_b;
-        tensors[prefix + "conv_in.weight"] = conv_in_w;
-        tensors[prefix + "conv_in.bias"] = conv_in_b;
+        tensors[prefix + "norm_out.bias"]   = norm_out_b;
+        tensors[prefix + "conv_in.weight"]  = conv_in_w;
+        tensors[prefix + "conv_in.bias"]    = conv_in_b;
         tensors[prefix + "conv_out.weight"] = conv_out_w;
-        tensors[prefix + "conv_out.bias"] = conv_out_b;
+        tensors[prefix + "conv_out.bias"]   = conv_out_b;
 
         mid.block_1.map_by_name(tensors, prefix + "mid.block_1.");
         mid.attn_1.map_by_name(tensors, prefix + "mid.attn_1.");
@@ -2535,11 +2535,11 @@ struct Decoder {
 
         // conv_in
         auto h = ggml_conv_2d(ctx, conv_in_w, z, 1, 1, 1, 1, 1, 1);
-        h = ggml_add(ctx,
-                     h,
-                     ggml_repeat(ctx,
-                                 ggml_reshape_4d(ctx, conv_in_b, 1, 1, conv_in_b->ne[0], 1),
-                                 h));  // [N, block_in, h, w]
+        h      = ggml_add(ctx,
+                          h,
+                          ggml_repeat(ctx,
+                                      ggml_reshape_4d(ctx, conv_in_b, 1, 1, conv_in_b->ne[0], 1),
+                                      h));  // [N, block_in, h, w]
 
         h = mid.block_1.forward(ctx, h);
         h = mid.attn_1.forward(ctx, h);
@@ -2580,15 +2580,15 @@ struct Decoder {
 
 // ldm.models.autoencoder.AutoencoderKL
 struct AutoEncoderKL {
     bool decode_only = true;
-    int embed_dim = 4;
+    int embed_dim    = 4;
     struct {
-        int z_channels = 4;
-        int resolution = 256;
-        int in_channels = 3;
-        int out_ch = 3;
-        int ch = 128;
-        int ch_mult[4] = {1, 2, 4, 4};
+        int z_channels     = 4;
+        int resolution     = 256;
+        int in_channels    = 3;
+        int out_ch         = 3;
+        int ch             = 128;
+        int ch_mult[4]     = {1, 2, 4, 4};
         int num_res_blocks = 2;
     } dd_config;
@@ -2606,14 +2606,14 @@ struct AutoEncoderKL {
         assert(sizeof(dd_config.ch_mult) == sizeof(encoder.ch_mult));
         assert(sizeof(dd_config.ch_mult) == sizeof(decoder.ch_mult));
 
-        encoder.embed_dim = embed_dim;
-        decoder.embed_dim = embed_dim;
-        encoder.ch = dd_config.ch;
-        decoder.ch = dd_config.ch;
-        encoder.z_channels = dd_config.z_channels;
-        decoder.z_channels = dd_config.z_channels;
-        encoder.in_channels = dd_config.in_channels;
-        decoder.out_ch = dd_config.out_ch;
+        encoder.embed_dim      = embed_dim;
+        decoder.embed_dim      = embed_dim;
+        encoder.ch             = dd_config.ch;
+        decoder.ch             = dd_config.ch;
+        encoder.z_channels     = dd_config.z_channels;
+        decoder.z_channels     = dd_config.z_channels;
+        encoder.in_channels    = dd_config.in_channels;
+        decoder.out_ch         = dd_config.out_ch;
         encoder.num_res_blocks = dd_config.num_res_blocks;
 
         int len_mults = sizeof(dd_config.ch_mult) / sizeof(int);
@@ -2654,12 +2654,12 @@ struct AutoEncoderKL {
     void map_by_name(std::map<std::string, struct ggml_tensor*>& tensors, const std::string prefix) {
         if (!decode_only) {
             tensors[prefix + "quant_conv.weight"] = quant_conv_w;
-            tensors[prefix + "quant_conv.bias"] = quant_conv_b;
+            tensors[prefix + "quant_conv.bias"]   = quant_conv_b;
             encoder.map_by_name(tensors, prefix + "encoder.");
         }
 
         tensors[prefix + "post_quant_conv.weight"] = post_quant_conv_w;
-        tensors[prefix + "post_quant_conv.bias"] = post_quant_conv_b;
+        tensors[prefix + "post_quant_conv.bias"]   = post_quant_conv_b;
         decoder.map_by_name(tensors, prefix + "decoder.");
     }
@@ -2668,12 +2668,12 @@ struct AutoEncoderKL {
 
         // post_quant_conv
         auto h = ggml_conv_2d(ctx, post_quant_conv_w, z, 1, 1, 0, 0, 1, 1);
-        h = ggml_add(ctx,
-                     h,
-                     ggml_repeat(ctx,
-                                 ggml_reshape_4d(ctx, post_quant_conv_b, 1, 1, post_quant_conv_b->ne[0], 1),
-                                 h));  // [N, z_channels, h, w]
-        h = decoder.forward(ctx, h);
+        h      = ggml_add(ctx,
+                          h,
+                          ggml_repeat(ctx,
+                                      ggml_reshape_4d(ctx, post_quant_conv_b, 1, 1, post_quant_conv_b->ne[0], 1),
+                                      h));  // [N, z_channels, h, w]
+        h      = decoder.forward(ctx, h);
         return h;
     }
@@ -2716,22 +2716,22 @@ struct SigmaSchedule {
                 low_idx++;
             }
         }
-        low_idx = std::min(std::max(low_idx - 1, 0), TIMESTEPS - 2);
+        low_idx      = std::min(std::max(low_idx - 1, 0), TIMESTEPS - 2);
         int high_idx = low_idx + 1;
 
-        float low = log_sigmas[low_idx];
+        float low  = log_sigmas[low_idx];
         float high = log_sigmas[high_idx];
-        float w = (low - log_sigma) / (low - high);
-        w = std::max(0.f, std::min(1.f, w));
-        float t = (1.0f - w) * low_idx + w * high_idx;
+        float w    = (low - log_sigma) / (low - high);
+        w          = std::max(0.f, std::min(1.f, w));
+        float t    = (1.0f - w) * low_idx + w * high_idx;
 
         return t;
     }
 
     float t_to_sigma(float t) {
-        int low_idx = static_cast<int>(std::floor(t));
-        int high_idx = static_cast<int>(std::ceil(t));
-        float w = t - static_cast<float>(low_idx);
+        int low_idx     = static_cast<int>(std::floor(t));
+        int high_idx    = static_cast<int>(std::ceil(t));
+        float w         = t - static_cast<float>(low_idx);
         float log_sigma = (1.0f - w) * log_sigmas[low_idx] + w * log_sigmas[high_idx];
         return std::exp(log_sigma);
     }
@@ -2767,7 +2767,7 @@ struct KarrasSchedule : SigmaSchedule {
         // but does anybody ever bother to touch them?
         float sigma_min = 0.1f;
         float sigma_max = 10.f;
-        float rho = 7.f;
+        float rho       = 7.f;
 
         std::vector<float> result(n + 1);
@@ -2783,7 +2783,7 @@ struct KarrasSchedule : SigmaSchedule {
 };
 
 struct Denoiser {
-    std::shared_ptr<SigmaSchedule> schedule = std::make_shared<DiscreteSchedule>();
+    std::shared_ptr<SigmaSchedule> schedule              = std::make_shared<DiscreteSchedule>();
     virtual std::vector<float> get_scalings(float sigma) = 0;
 };
@@ -2792,7 +2792,7 @@ struct CompVisDenoiser : public Denoiser {
     std::vector<float> get_scalings(float sigma) {
         float c_out = -sigma;
-        float c_in = 1.0f / std::sqrt(sigma * sigma + sigma_data * sigma_data);
+        float c_in  = 1.0f / std::sqrt(sigma * sigma + sigma_data * sigma_data);
         return {c_out, c_in};
     }
 };
@@ -2802,8 +2802,8 @@ struct CompVisVDenoiser : public Denoiser {
     std::vector<float> get_scalings(float sigma) {
         float c_skip = sigma_data * sigma_data / (sigma * sigma + sigma_data * sigma_data);
-        float c_out = -sigma * sigma_data / std::sqrt(sigma * sigma + sigma_data * sigma_data);
-        float c_in = 1.0f / std::sqrt(sigma * sigma + sigma_data * sigma_data);
+        float c_out  = -sigma * sigma_data / std::sqrt(sigma * sigma + sigma_data * sigma_data);
+        float c_in   = 1.0f / std::sqrt(sigma * sigma + sigma_data * sigma_data);
         return {c_skip, c_out, c_in};
     }
 };
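These hunks follow the k-diffusion conventions: a Karras schedule spaces sigmas as sigma_i = (sigma_max^(1/rho) + i/(n-1) * (sigma_min^(1/rho) - sigma_max^(1/rho)))^rho with rho = 7, and the denoiser scalings (c_skip, c_out, c_in) normalize the network input and output against sigma_data. A sketch of the sigma ramp under the constants shown above (the full `KarrasSchedule` body lies outside this diff):

```cpp
// Karras et al. (2022) sigma schedule sketch: rho-spaced ramp from
// sigma_max down to sigma_min, matching the constants in KarrasSchedule.
#include <cmath>
#include <vector>

std::vector<float> karras_sigmas(int n, float sigma_min = 0.1f,
                                 float sigma_max = 10.f, float rho = 7.f) {
    std::vector<float> sigmas(n);
    float min_inv_rho = std::pow(sigma_min, 1.0f / rho);
    float max_inv_rho = std::pow(sigma_max, 1.0f / rho);
    for (int i = 0; i < n; i++) {
        float t   = (n > 1) ? (float)i / (n - 1) : 0.0f;
        sigmas[i] = std::pow(max_inv_rho + t * (min_inv_rho - max_inv_rho), rho);
    }
    return sigmas;  // callers typically append a final 0.0f, hence result(n + 1) above
}
```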
LOG_DEBUG("vae params ctx size = % 6.2f MB", ctx_size / (1024.0 * 1024.0)); struct ggml_init_params params; - params.mem_size = static_cast(ctx_size); + params.mem_size = static_cast(ctx_size); params.mem_buffer = NULL; - params.no_alloc = false; - params.dynamic = false; + params.no_alloc = false; + params.dynamic = false; vae_params_ctx = ggml_init(params); if (!vae_params_ctx) { @@ -3048,7 +3048,7 @@ class StableDiffusionGGML { // load weights float alphas_cumprod[TIMESTEPS]; { - int n_tensors = 0; + int n_tensors = 0; size_t total_size = 0; while (true) { @@ -3065,7 +3065,7 @@ class StableDiffusionGGML { } int32_t nelements = 1; - int32_t ne[4] = {1, 1, 1, 1}; + int32_t ne[4] = {1, 1, 1, 1}; for (int i = 0; i < n_dims; ++i) { file.read(reinterpret_cast(&ne[i]), sizeof(ne[i])); nelements *= ne[i]; @@ -3147,8 +3147,8 @@ class StableDiffusionGGML { } LOG_DEBUG("model size = %.2fMB", total_size / 1024.0 / 1024.0); } - max_params_mem_size = ggml_used_mem(clip_params_ctx) + ggml_used_mem(unet_params_ctx) + ggml_used_mem(vae_params_ctx); - max_mem_size = max_params_mem_size; + max_params_mem_size = ggml_used_mem(clip_params_ctx) + ggml_used_mem(unet_params_ctx) + ggml_used_mem(vae_params_ctx); + max_mem_size = max_params_mem_size; curr_params_mem_size = max_params_mem_size; LOG_INFO("total params size = %.2fMB (clip %.2fMB, unet %.2fMB, vae %.2fMB)", max_params_mem_size / 1024.0 / 1024.0, @@ -3163,10 +3163,10 @@ class StableDiffusionGGML { bool is_using_v_parameterization = false; if (model_type == SD2) { struct ggml_init_params params; - params.mem_size = static_cast(10 * 1024) * 1024; // 10M - params.mem_buffer = NULL; - params.no_alloc = false; - params.dynamic = false; + params.mem_size = static_cast(10 * 1024) * 1024; // 10M + params.mem_buffer = NULL; + params.no_alloc = false; + params.dynamic = false; struct ggml_context* ctx = ggml_init(params); if (!ctx) { LOG_ERROR("ggml_init() failed"); @@ -3205,8 +3205,8 @@ class StableDiffusionGGML { for (int i = 0; i < TIMESTEPS; i++) { denoiser->schedule->alphas_cumprod[i] = alphas_cumprod[i]; - denoiser->schedule->sigmas[i] = std::sqrt((1 - denoiser->schedule->alphas_cumprod[i]) / denoiser->schedule->alphas_cumprod[i]); - denoiser->schedule->log_sigmas[i] = std::log(denoiser->schedule->sigmas[i]); + denoiser->schedule->sigmas[i] = std::sqrt((1 - denoiser->schedule->alphas_cumprod[i]) / denoiser->schedule->alphas_cumprod[i]); + denoiser->schedule->log_sigmas[i] = std::log(denoiser->schedule->sigmas[i]); } return true; @@ -3224,10 +3224,10 @@ class StableDiffusionGGML { // calculate the amount of memory required { struct ggml_init_params params; - params.mem_size = ctx_size; + params.mem_size = ctx_size; params.mem_buffer = NULL; - params.no_alloc = true; - params.dynamic = dynamic; + params.no_alloc = true; + params.dynamic = dynamic; struct ggml_context* ctx = ggml_init(params); if (!ctx) { @@ -3236,15 +3236,15 @@ class StableDiffusionGGML { } ggml_set_dynamic(ctx, false); - struct ggml_tensor* timesteps = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 1); // [N, ] - struct ggml_tensor* t_emb = new_timestep_embedding(ctx, timesteps, diffusion_model.model_channels); // [N, model_channels] + struct ggml_tensor* timesteps = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 1); // [N, ] + struct ggml_tensor* t_emb = new_timestep_embedding(ctx, timesteps, diffusion_model.model_channels); // [N, model_channels] ggml_set_dynamic(ctx, params.dynamic); struct ggml_tensor* out = diffusion_model.forward(ctx, x_t, NULL, c, t_emb); ctx_size += ggml_used_mem(ctx) + 
@@ -3224,10 +3224,10 @@ class StableDiffusionGGML {
         // calculate the amount of memory required
         {
             struct ggml_init_params params;
-            params.mem_size = ctx_size;
+            params.mem_size   = ctx_size;
             params.mem_buffer = NULL;
-            params.no_alloc = true;
-            params.dynamic = dynamic;
+            params.no_alloc   = true;
+            params.dynamic    = dynamic;

             struct ggml_context* ctx = ggml_init(params);
             if (!ctx) {
@@ -3236,15 +3236,15 @@ class StableDiffusionGGML {
             }

             ggml_set_dynamic(ctx, false);
-            struct ggml_tensor* timesteps = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 1);  // [N, ]
-            struct ggml_tensor* t_emb = new_timestep_embedding(ctx, timesteps, diffusion_model.model_channels);  // [N, model_channels]
+            struct ggml_tensor* timesteps = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 1);                               // [N, ]
+            struct ggml_tensor* t_emb     = new_timestep_embedding(ctx, timesteps, diffusion_model.model_channels);  // [N, model_channels]
             ggml_set_dynamic(ctx, params.dynamic);

             struct ggml_tensor* out = diffusion_model.forward(ctx, x_t, NULL, c, t_emb);
             ctx_size += ggml_used_mem(ctx) + ggml_used_mem_of_data(ctx);

             struct ggml_cgraph* diffusion_graph = ggml_build_forward_ctx(ctx, out);
-            cplan = ggml_graph_plan(diffusion_graph, n_threads);
+            cplan                               = ggml_graph_plan(diffusion_graph, n_threads);

             ctx_size += cplan.work_size;

             LOG_DEBUG("diffusion context need %.2fMB static memory, with work_size needing %.2fMB",
@@ -3255,10 +3255,10 @@ class StableDiffusionGGML {
         }

         struct ggml_init_params params;
-        params.mem_size = ctx_size;
+        params.mem_size   = ctx_size;
         params.mem_buffer = NULL;
-        params.no_alloc = false;
-        params.dynamic = dynamic;
+        params.no_alloc   = false;
+        params.dynamic    = dynamic;

         struct ggml_context* ctx = ggml_init(params);
         if (!ctx) {
@@ -3267,8 +3267,8 @@ class StableDiffusionGGML {
         }

         ggml_set_dynamic(ctx, false);
-        struct ggml_tensor* timesteps = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 1);  // [N, ]
-        struct ggml_tensor* t_emb = new_timestep_embedding(ctx, timesteps, diffusion_model.model_channels);  // [N, model_channels]
+        struct ggml_tensor* timesteps = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 1);                               // [N, ]
+        struct ggml_tensor* t_emb     = new_timestep_embedding(ctx, timesteps, diffusion_model.model_channels);  // [N, model_channels]
         ggml_set_dynamic(ctx, params.dynamic);
         ggml_set_f32(timesteps, 999);
         set_timestep_embedding(timesteps, t_emb, diffusion_model.model_channels);
@@ -3277,7 +3277,7 @@ class StableDiffusionGGML {
         ggml_hold_dynamic_tensor(out);

         struct ggml_cgraph* diffusion_graph = ggml_build_forward_ctx(ctx, out);
-        cplan = ggml_graph_plan(diffusion_graph, n_threads);
+        cplan                               = ggml_graph_plan(diffusion_graph, n_threads);

         ggml_set_dynamic(ctx, false);
         struct ggml_tensor* buf = ggml_new_tensor_1d(ctx, GGML_TYPE_I8, cplan.work_size);
@@ -3291,7 +3291,7 @@ class StableDiffusionGGML {

         double result = 0.f;
         {
-            float* vec_x = (float*)x_t->data;
+            float* vec_x   = (float*)x_t->data;
             float* vec_out = (float*)out->data;

             int64_t n = ggml_nelements(out);
@@ -3369,10 +3369,10 @@ class StableDiffusionGGML {
         struct ggml_init_params params;
         size_t ctx_size = 10 * 1024 * 1024;  // 10 MB, for padding
         ctx_size += file_size;
-        params.mem_size = ctx_size;
+        params.mem_size   = ctx_size;
         params.mem_buffer = NULL;
-        params.no_alloc = false;
-        params.dynamic = false;
+        params.no_alloc   = false;
+        params.dynamic    = false;
         LOG_DEBUG("lora '%s' params ctx size = % 6.2f MB", lora_name.c_str(), ctx_size / (1024.0 * 1024.0));
         ggml_context* lora_params_ctx = ggml_init(params);
         if (!lora_params_ctx) {
@@ -3385,7 +3385,7 @@ class StableDiffusionGGML {
         int64_t t0 = ggml_time_ms();
         // load weights
         {
-            int n_tensors = 0;
+            int n_tensors     = 0;
             size_t total_size = 0;

             while (true) {
@@ -3402,7 +3402,7 @@ class StableDiffusionGGML {
                 }

                 int32_t nelements = 1;
-                int32_t ne[4] = {1, 1, 1, 1};
+                int32_t ne[4]     = {1, 1, 1, 1};
                 for (int i = 0; i < n_dims; ++i) {
                     file.read(reinterpret_cast<char*>(&ne[i]), sizeof(ne[i]));
                     nelements *= ne[i];
@@ -3416,7 +3416,7 @@ class StableDiffusionGGML {

                 // LOG_DEBUG("load lora tensor %s", name.c_str());

-                int64_t ne64[4] = {ne[0], ne[1], ne[2], ne[3]};
+                int64_t ne64[4]            = {ne[0], ne[1], ne[2], ne[3]};
                 struct ggml_tensor* tensor = ggml_new_tensor(lora_params_ctx, (ggml_type)ttype, n_dims, ne64);
                 file.read(reinterpret_cast<char*>(tensor->data), num_bytes);
@@ -3426,7 +3426,7 @@ class StableDiffusionGGML {
             }
         }
         lora_tensors[lora_name] = lora_tensor_map;
-        int64_t t1 = ggml_time_ms();
+        int64_t t1              = ggml_time_ms();

         LOG_INFO("lora '%s' params size = %.2fMB", lora_name.c_str(), ggml_used_mem(lora_params_ctx) / 1024.0 / 1024.0);
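Both weight-loading loops above read the same legacy ggml tensor layout: a few int32 header fields, one int32 per dimension, the name bytes, then the raw tensor data. A sketch of a single header read under that assumed layout (the field order is inferred from the loops, not copied from the project):

    #include <cstdint>
    #include <fstream>
    #include <string>

    bool read_tensor_header(std::ifstream& file, int32_t& n_dims,
                            int32_t& ttype, int32_t ne[4], std::string& name) {
        int32_t name_len = 0;
        file.read(reinterpret_cast<char*>(&n_dims), sizeof(n_dims));
        file.read(reinterpret_cast<char*>(&name_len), sizeof(name_len));
        file.read(reinterpret_cast<char*>(&ttype), sizeof(ttype));
        if (file.eof()) {
            return false;  // clean end of file: no more tensors
        }
        ne[0] = ne[1] = ne[2] = ne[3] = 1;  // unused dims default to 1, as above
        for (int i = 0; i < n_dims; ++i) {
            file.read(reinterpret_cast<char*>(&ne[i]), sizeof(ne[i]));
        }
        name.resize(name_len);
        file.read(&name[0], name_len);
        return static_cast<bool>(file);
    }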
@@ -3452,7 +3452,7 @@ class StableDiffusionGGML {
             return;
         }

-        size_t ctx_size = 500 * 1024 * 1024;  // 500MB
+        size_t ctx_size  = 500 * 1024 * 1024;  // 500MB
         void* mem_buffer = malloc(ctx_size);
         if (!mem_buffer) {
             if (free_params_immediately) {
@@ -3466,8 +3466,8 @@ class StableDiffusionGGML {
         std::set<std::string> applied_lora_tensors;
         for (auto& kv : tensors) {
             const std::string name = kv.first;
-            ggml_tensor* weight = kv.second;
-            std::string ending = ".weight";
+            ggml_tensor* weight    = kv.second;
+            std::string ending     = ".weight";
             if (!ends_with(name, ending)) {
                 continue;
             }
@@ -3475,12 +3475,12 @@ class StableDiffusionGGML {
             // find corresponding lora tensors
             std::string network_name = name.substr(0, name.size() - ending.size());  // remove .weight
             replace_all_chars(network_name, '.', '_');
-            std::string lora_up_name = network_name + ".lora_up.weight";
+            std::string lora_up_name   = network_name + ".lora_up.weight";
             std::string lora_down_name = network_name + ".lora_down.weight";
-            std::string alpha_name = network_name + ".alpha";
-            std::string scale_name = network_name + ".scale";
+            std::string alpha_name     = network_name + ".alpha";
+            std::string scale_name     = network_name + ".scale";

-            ggml_tensor* lora_up = NULL;
+            ggml_tensor* lora_up   = NULL;
             ggml_tensor* lora_down = NULL;

             float scale = 1.0f;
@@ -3508,10 +3508,10 @@ class StableDiffusionGGML {
             int64_t dim = lora_down->ne[lora_down->n_dims - 1];
             if (lora_tensor_map.find(scale_name) != lora_tensor_map.end()) {
                 ggml_tensor* t = lora_tensor_map[scale_name];
-                scale = ggml_get_f32_1d(t, 0);
+                scale          = ggml_get_f32_1d(t, 0);
             } else if (lora_tensor_map.find(alpha_name) != lora_tensor_map.end()) {
                 ggml_tensor* t = lora_tensor_map[alpha_name];
-                scale = ggml_get_f32_1d(t, 0) / dim;
+                scale          = ggml_get_f32_1d(t, 0) / dim;
             }

             // LOG_DEBUG("scale: %f %ld", scale, dim);
@@ -3521,10 +3521,10 @@ class StableDiffusionGGML {
             // apply
             {
                 struct ggml_init_params params;
-                params.mem_size = ctx_size;
+                params.mem_size   = ctx_size;
                 params.mem_buffer = mem_buffer;
-                params.no_alloc = false;
-                params.dynamic = false;
+                params.no_alloc   = false;
+                params.dynamic    = false;

                 struct ggml_context* ctx = ggml_init(params);
                 if (!ctx) {
@@ -3538,21 +3538,21 @@ class StableDiffusionGGML {
                 ggml_tensor* scale_factor = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 1);
                 ggml_set_f32_1d(scale_factor, 0, scale);

-                int64_t lora_up_size_0 = lora_up->ne[lora_up->n_dims - 1];
-                lora_up = ggml_reshape_2d(ctx, lora_up, ggml_nelements(lora_up) / lora_up_size_0, lora_up_size_0);
+                int64_t lora_up_size_0   = lora_up->ne[lora_up->n_dims - 1];
+                lora_up                  = ggml_reshape_2d(ctx, lora_up, ggml_nelements(lora_up) / lora_up_size_0, lora_up_size_0);
                 int64_t lora_down_size_0 = lora_down->ne[lora_down->n_dims - 1];
-                lora_down = ggml_reshape_2d(ctx, lora_down, ggml_nelements(lora_down) / lora_down_size_0, lora_down_size_0);
+                lora_down                = ggml_reshape_2d(ctx, lora_down, ggml_nelements(lora_down) / lora_down_size_0, lora_down_size_0);

                 lora_down = ggml_cont(ctx, ggml_transpose(ctx, lora_down));

                 if (lora_down->type != GGML_TYPE_F32) {
                     ggml_tensor* lora_down_f32 = ggml_new_tensor(ctx, GGML_TYPE_F32, lora_down->n_dims, lora_down->ne);
-                    lora_down = ggml_cpy_inplace(ctx, lora_down, lora_down_f32);
+                    lora_down                  = ggml_cpy_inplace(ctx, lora_down, lora_down_f32);
                 }

                 ggml_tensor* updown = ggml_mul_mat(ctx, lora_up, lora_down);
-                updown = ggml_cont(ctx, ggml_transpose(ctx, updown));
-                updown = ggml_reshape(ctx, updown, weight);
+                updown              = ggml_cont(ctx, ggml_transpose(ctx, updown));
+                updown              = ggml_reshape(ctx, updown, weight);

                 GGML_ASSERT(ggml_nelements(updown) == ggml_nelements(weight));
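The LoRA merge above folds each adapter into the base weights as W += multiplier * scale * (up · down), with scale defaulting to alpha / rank when an ".alpha" tensor is present. The same update on plain float buffers, as a hypothetical sketch (not the ggml path used above):

    #include <vector>

    // W: [out x in] row-major, up: [out x r], down: [r x in]
    void apply_lora_delta(std::vector<float>& W,
                          const std::vector<float>& up,
                          const std::vector<float>& down,
                          int out, int in, int r,
                          float alpha, float multiplier) {
        float scale = alpha / r;  // ".alpha" convention from the hunk above
        for (int o = 0; o < out; o++) {
            for (int c = 0; c < in; c++) {
                float acc = 0.0f;
                for (int k = 0; k < r; k++) {
                    acc += up[o * r + k] * down[k * in + c];
                }
                W[o * in + c] += multiplier * scale * acc;
            }
        }
    }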
@@ -3596,7 +3596,7 @@ class StableDiffusionGGML {
         std::unordered_map<std::string, float> lora_state_diff;
         for (auto& kv : lora_state) {
             const std::string& lora_name = kv.first;
-            float multiplier = kv.second;
+            float multiplier             = kv.second;

             if (curr_lora_state.find(lora_name) != curr_lora_state.end()) {
                 float curr_multiplier = curr_lora_state[lora_name];
@@ -3617,20 +3617,20 @@ class StableDiffusionGGML {
     }

     ggml_tensor* get_learned_condition(ggml_context* res_ctx, const std::string& text) {
-        auto tokens_and_weights = cond_stage_model.tokenize(text,
-                                                            cond_stage_model.text_model.max_position_embeddings,
-                                                            true);
-        std::vector<int>& tokens = tokens_and_weights.first;
+        auto tokens_and_weights     = cond_stage_model.tokenize(text,
+                                                                cond_stage_model.text_model.max_position_embeddings,
+                                                                true);
+        std::vector<int>& tokens    = tokens_and_weights.first;
         std::vector<float>& weights = tokens_and_weights.second;
         struct ggml_cplan cplan;
         size_t ctx_size = 10 * 1024 * 1024;  // 10MB
         // calculate the amount of memory required
         {
             struct ggml_init_params params;
-            params.mem_size = ctx_size;
+            params.mem_size   = ctx_size;
             params.mem_buffer = NULL;
-            params.no_alloc = true;
-            params.dynamic = dynamic;
+            params.no_alloc   = true;
+            params.dynamic    = dynamic;

             struct ggml_context* ctx = ggml_init(params);
             if (!ctx) {
@@ -3645,7 +3645,7 @@ class StableDiffusionGGML {
             struct ggml_tensor* hidden_states = cond_stage_model.text_model.forward(ctx, input_ids);

             struct ggml_cgraph* cond_graph = ggml_build_forward_ctx(ctx, hidden_states);
-            cplan = ggml_graph_plan(cond_graph, n_threads);
+            cplan                          = ggml_graph_plan(cond_graph, n_threads);
             ctx_size += cplan.work_size;

             ctx_size += ggml_used_mem(ctx) + ggml_used_mem_of_data(ctx);
@@ -3657,10 +3657,10 @@ class StableDiffusionGGML {

         // allocate the required memory and compute forward
         struct ggml_init_params params;
-        params.mem_size = ctx_size;
+        params.mem_size   = ctx_size;
         params.mem_buffer = NULL;
-        params.no_alloc = false;
-        params.dynamic = dynamic;
+        params.no_alloc   = false;
+        params.dynamic    = dynamic;

         struct ggml_context* ctx = ggml_init(params);
         if (!ctx) {
@@ -3673,7 +3673,7 @@ class StableDiffusionGGML {
         ggml_set_dynamic(ctx, params.dynamic);

         struct ggml_tensor* hidden_states = cond_stage_model.text_model.forward(ctx, input_ids);
-        struct ggml_cgraph* cond_graph = ggml_build_forward_ctx(ctx, hidden_states);
+        struct ggml_cgraph* cond_graph    = ggml_build_forward_ctx(ctx, hidden_states);
         LOG_DEBUG("building condition graph completed: %d nodes, %d leafs",
                   cond_graph->n_nodes, cond_graph->n_leafs);
@@ -3687,10 +3687,10 @@ class StableDiffusionGGML {
         ggml_tensor* result = ggml_dup_tensor(res_ctx, hidden_states);  // [N, n_token, hidden_size]

         {
-            int64_t nelements = ggml_nelements(hidden_states);
+            int64_t nelements   = ggml_nelements(hidden_states);
             float original_mean = 0.f;
-            float new_mean = 0.f;
-            float* vec = (float*)hidden_states->data;
+            float new_mean      = 0.f;
+            float* vec          = (float*)hidden_states->data;
             for (int i = 0; i < nelements; i++) {
                 original_mean += vec[i] / nelements * 1.0f;
             }
@@ -3762,10 +3762,10 @@ class StableDiffusionGGML {
         // calculate the amount of memory required
         {
             struct ggml_init_params params;
-            params.mem_size = ctx_size;
+            params.mem_size   = ctx_size;
             params.mem_buffer = NULL;
-            params.no_alloc = true;
-            params.dynamic = dynamic;
+            params.no_alloc   = true;
+            params.dynamic    = dynamic;

             struct ggml_context* ctx = ggml_init(params);
             if (!ctx) {
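get_learned_condition() above ends by rescaling the weighted hidden states so their mean matches the pre-weighting mean (original_mean vs. new_mean). Condensed into one helper, as a sketch rather than the project's code:

    #include <cstdint>

    // Scale vec so its mean returns to original_mean (assumes the new mean
    // is non-zero, as the hunk above implicitly does).
    void restore_mean(float* vec, int64_t n, float original_mean) {
        float new_mean = 0.0f;
        for (int64_t i = 0; i < n; i++) {
            new_mean += vec[i] / n;
        }
        for (int64_t i = 0; i < n; i++) {
            vec[i] *= original_mean / new_mean;
        }
    }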
@@ -3775,16 +3775,16 @@ class StableDiffusionGGML {
             ggml_set_dynamic(ctx, false);
             struct ggml_tensor* noised_input = ggml_dup_tensor(ctx, x_t);
-            struct ggml_tensor* context = ggml_dup_tensor(ctx, c);
-            struct ggml_tensor* timesteps = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 1);  // [N, ]
-            struct ggml_tensor* t_emb = new_timestep_embedding(ctx, timesteps, diffusion_model.model_channels);  // [N, model_channels]
+            struct ggml_tensor* context      = ggml_dup_tensor(ctx, c);
+            struct ggml_tensor* timesteps    = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 1);                               // [N, ]
+            struct ggml_tensor* t_emb        = new_timestep_embedding(ctx, timesteps, diffusion_model.model_channels);  // [N, model_channels]
             ggml_set_dynamic(ctx, params.dynamic);

             struct ggml_tensor* out = diffusion_model.forward(ctx, noised_input, NULL, context, t_emb);
             ctx_size += ggml_used_mem(ctx) + ggml_used_mem_of_data(ctx);

             struct ggml_cgraph* diffusion_graph = ggml_build_forward_ctx(ctx, out);
-            cplan = ggml_graph_plan(diffusion_graph, n_threads);
+            cplan                               = ggml_graph_plan(diffusion_graph, n_threads);

             ctx_size += cplan.work_size;

             LOG_DEBUG("diffusion context need %.2fMB static memory, with work_size needing %.2fMB",
@@ -3795,10 +3795,10 @@ class StableDiffusionGGML {
         }

         struct ggml_init_params params;
-        params.mem_size = ctx_size;
+        params.mem_size   = ctx_size;
         params.mem_buffer = NULL;
-        params.no_alloc = false;
-        params.dynamic = dynamic;
+        params.no_alloc   = false;
+        params.dynamic    = dynamic;

         struct ggml_context* ctx = ggml_init(params);
         if (!ctx) {
@@ -3808,16 +3808,16 @@ class StableDiffusionGGML {

         ggml_set_dynamic(ctx, false);
         struct ggml_tensor* noised_input = ggml_dup_tensor(ctx, x_t);
-        struct ggml_tensor* context = ggml_dup_tensor(ctx, c);
-        struct ggml_tensor* timesteps = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 1);  // [N, ]
-        struct ggml_tensor* t_emb = new_timestep_embedding(ctx, timesteps, diffusion_model.model_channels);  // [N, model_channels]
+        struct ggml_tensor* context      = ggml_dup_tensor(ctx, c);
+        struct ggml_tensor* timesteps    = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 1);                               // [N, ]
+        struct ggml_tensor* t_emb        = new_timestep_embedding(ctx, timesteps, diffusion_model.model_channels);  // [N, model_channels]
         ggml_set_dynamic(ctx, params.dynamic);

         struct ggml_tensor* out = diffusion_model.forward(ctx, noised_input, NULL, context, t_emb);
         ggml_hold_dynamic_tensor(out);

         struct ggml_cgraph* diffusion_graph = ggml_new_graph(ctx);
-        diffusion_graph->order = GGML_CGRAPH_EVAL_ORDER_RIGHT_TO_LEFT;
+        diffusion_graph->order              = GGML_CGRAPH_EVAL_ORDER_RIGHT_TO_LEFT;
         ggml_build_forward_expand(diffusion_graph, out);
         cplan = ggml_graph_plan(diffusion_graph, n_threads);
@@ -3837,7 +3837,7 @@ class StableDiffusionGGML {

         // denoise wrapper
         ggml_set_dynamic(ctx, false);
-        struct ggml_tensor* out_cond = NULL;
+        struct ggml_tensor* out_cond   = NULL;
         struct ggml_tensor* out_uncond = NULL;
         if (cfg_scale != 1.0f && uc != NULL) {
             out_uncond = ggml_dup_tensor(ctx, x);
@@ -3848,17 +3848,17 @@ class StableDiffusionGGML {
         auto denoise = [&](ggml_tensor* input, float sigma, int step) {
             int64_t t0 = ggml_time_ms();

-            float c_skip = 1.0f;
-            float c_out = 1.0f;
-            float c_in = 1.0f;
+            float c_skip               = 1.0f;
+            float c_out                = 1.0f;
+            float c_in                 = 1.0f;
             std::vector<float> scaling = denoiser->get_scalings(sigma);
             if (scaling.size() == 3) {  // CompVisVDenoiser
                 c_skip = scaling[0];
-                c_out = scaling[1];
-                c_in = scaling[2];
+                c_out  = scaling[1];
+                c_in   = scaling[2];
             } else {  // CompVisDenoiser
                 c_out = scaling[0];
-                c_in = scaling[1];
+                c_in  = scaling[1];
             }

             float t = denoiser->schedule->sigma_to_t(sigma);
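The denoise lambda above scales its input by c_in, runs the UNet once (or twice when a negative prompt is active), and then blends the two predictions; the hunk that follows shows the blend. Isolated as a sketch:

    #include <cstdint>

    // out = uncond + cfg_scale * (cond - uncond); cfg_scale > 1 pushes the
    // prediction away from the unconditional output, toward the prompt.
    void apply_cfg(float* out, const float* uncond, const float* cond,
                   int64_t n, float cfg_scale) {
        for (int64_t i = 0; i < n; i++) {
            out[i] = uncond[i] + cfg_scale * (cond[i] - uncond[i]);
        }
    }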
@@ -3888,9 +3888,9 @@ class StableDiffusionGGML {

                 // out_uncond + cfg_scale * (out_cond - out_uncond)
                 {
-                    float* vec_out = (float*)out->data;
+                    float* vec_out        = (float*)out->data;
                     float* vec_out_uncond = (float*)out_uncond->data;
-                    float* vec_out_cond = (float*)out_cond->data;
+                    float* vec_out_cond   = (float*)out_cond->data;

                     for (int i = 0; i < ggml_nelements(out); i++) {
                         vec_out[i] = vec_out_uncond[i] + cfg_scale * (vec_out_cond[i] - vec_out_uncond[i]);
@@ -3906,8 +3906,8 @@ class StableDiffusionGGML {
             // denoised = (v * c_out + input * c_skip) or (input + eps * c_out)
             {
                 float* vec_denoised = (float*)denoised->data;
-                float* vec_input = (float*)input->data;
-                float* vec_out = (float*)out->data;
+                float* vec_input    = (float*)input->data;
+                float* vec_out      = (float*)out->data;

                 for (int i = 0; i < ggml_nelements(denoised); i++) {
                     vec_denoised[i] = vec_out[i] * c_out + vec_input[i] * c_skip;
@@ -3934,7 +3934,7 @@ class StableDiffusionGGML {
                 LOG_INFO("sampling using Euler A method");
                 ggml_set_dynamic(ctx, false);
                 struct ggml_tensor* noise = ggml_dup_tensor(ctx, x);
-                struct ggml_tensor* d = ggml_dup_tensor(ctx, x);
+                struct ggml_tensor* d     = ggml_dup_tensor(ctx, x);
                 ggml_set_dynamic(ctx, params.dynamic);

                 for (int i = 0; i < steps; i++) {
@@ -3945,8 +3945,8 @@ class StableDiffusionGGML {

                     // d = (x - denoised) / sigma
                     {
-                        float* vec_d = (float*)d->data;
-                        float* vec_x = (float*)x->data;
+                        float* vec_d        = (float*)d->data;
+                        float* vec_x        = (float*)x->data;
                         float* vec_denoised = (float*)denoised->data;

                         for (int i = 0; i < ggml_nelements(d); i++) {
@@ -3955,8 +3955,8 @@ class StableDiffusionGGML {
                     }

                     // get_ancestral_step
-                    float sigma_up = std::min(sigmas[i + 1],
-                                              std::sqrt(sigmas[i + 1] * sigmas[i + 1] * (sigmas[i] * sigmas[i] - sigmas[i + 1] * sigmas[i + 1]) / (sigmas[i] * sigmas[i])));
+                    float sigma_up   = std::min(sigmas[i + 1],
+                                                std::sqrt(sigmas[i + 1] * sigmas[i + 1] * (sigmas[i] * sigmas[i] - sigmas[i + 1] * sigmas[i + 1]) / (sigmas[i] * sigmas[i])));
                     float sigma_down = std::sqrt(sigmas[i + 1] * sigmas[i + 1] - sigma_up * sigma_up);

                     // Euler method
@@ -3976,7 +3976,7 @@ class StableDiffusionGGML {
                         ggml_tensor_set_f32_randn(noise, rng);
                         // noise = load_tensor_from_file(res_ctx, "./rand" + std::to_string(i+1) + ".bin");
                         {
-                            float* vec_x = (float*)x->data;
+                            float* vec_x     = (float*)x->data;
                             float* vec_noise = (float*)noise->data;

                             for (int i = 0; i < ggml_nelements(x); i++) {
@@ -4001,8 +4001,8 @@ class StableDiffusionGGML {

                     // d = (x - denoised) / sigma
                     {
-                        float* vec_d = (float*)d->data;
-                        float* vec_x = (float*)x->data;
+                        float* vec_d        = (float*)d->data;
+                        float* vec_x        = (float*)x->data;
                         float* vec_denoised = (float*)denoised->data;

                         for (int j = 0; j < ggml_nelements(d); j++) {
@@ -4025,7 +4025,7 @@ class StableDiffusionGGML {
             case HEUN: {
                 LOG_INFO("sampling using Heun method");
                 ggml_set_dynamic(ctx, false);
-                struct ggml_tensor* d = ggml_dup_tensor(ctx, x);
+                struct ggml_tensor* d  = ggml_dup_tensor(ctx, x);
                 struct ggml_tensor* x2 = ggml_dup_tensor(ctx, x);
                 ggml_set_dynamic(ctx, params.dynamic);

@@ -4035,8 +4035,8 @@ class StableDiffusionGGML {

                     // d = (x - denoised) / sigma
                     {
-                        float* vec_d = (float*)d->data;
-                        float* vec_x = (float*)x->data;
+                        float* vec_d        = (float*)d->data;
+                        float* vec_x        = (float*)x->data;
                         float* vec_denoised = (float*)denoised->data;

                         for (int j = 0; j < ggml_nelements(x); j++) {
@@ -4056,9 +4056,9 @@ class StableDiffusionGGML {
                         }
                     } else {
                         // Heun step
-                        float* vec_d = (float*)d->data;
+                        float* vec_d  = (float*)d->data;
                         float* vec_d2 = (float*)d->data;
-                        float* vec_x = (float*)x->data;
+                        float* vec_x  = (float*)x->data;
                         float* vec_x2 = (float*)x2->data;

                         for (int j = 0; j < ggml_nelements(x); j++) {
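Euler A above splits each move to sigma_{i+1} into a deterministic part (sigma_down) and fresh noise (sigma_up), following k-diffusion's get_ancestral_step. The same split as a standalone sketch:

    #include <algorithm>
    #include <cmath>
    #include <utility>

    // Returns {sigma_down, sigma_up}: step deterministically to sigma_down,
    // then add noise scaled by sigma_up to land at sigma_next overall.
    std::pair<float, float> ancestral_step(float sigma, float sigma_next) {
        float sigma_up   = std::min(sigma_next,
                                    std::sqrt(sigma_next * sigma_next * (sigma * sigma - sigma_next * sigma_next) / (sigma * sigma)));
        float sigma_down = std::sqrt(sigma_next * sigma_next - sigma_up * sigma_up);
        return {sigma_down, sigma_up};
    }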
@@ -4078,7 +4078,7 @@ class StableDiffusionGGML {
             case DPM2: {
                 LOG_INFO("sampling using DPM2 method");
                 ggml_set_dynamic(ctx, false);
-                struct ggml_tensor* d = ggml_dup_tensor(ctx, x);
+                struct ggml_tensor* d  = ggml_dup_tensor(ctx, x);
                 struct ggml_tensor* x2 = ggml_dup_tensor(ctx, x);
                 ggml_set_dynamic(ctx, params.dynamic);

@@ -4088,8 +4088,8 @@ class StableDiffusionGGML {

                     // d = (x - denoised) / sigma
                     {
-                        float* vec_d = (float*)d->data;
-                        float* vec_x = (float*)x->data;
+                        float* vec_d        = (float*)d->data;
+                        float* vec_x        = (float*)x->data;
                         float* vec_denoised = (float*)denoised->data;

                         for (int j = 0; j < ggml_nelements(x); j++) {
@@ -4100,7 +4100,7 @@ class StableDiffusionGGML {
                     if (sigmas[i + 1] == 0) {
                         // Euler step
                         // x = x + d * dt
-                        float dt = sigmas[i + 1] - sigmas[i];
+                        float dt     = sigmas[i + 1] - sigmas[i];
                         float* vec_d = (float*)d->data;
                         float* vec_x = (float*)x->data;

@@ -4110,11 +4110,11 @@ class StableDiffusionGGML {
                     } else {
                         // DPM-Solver-2
                         float sigma_mid = exp(0.5f * (log(sigmas[i]) + log(sigmas[i + 1])));
-                        float dt_1 = sigma_mid - sigmas[i];
-                        float dt_2 = sigmas[i + 1] - sigmas[i];
+                        float dt_1      = sigma_mid - sigmas[i];
+                        float dt_2      = sigmas[i + 1] - sigmas[i];

-                        float* vec_d = (float*)d->data;
-                        float* vec_x = (float*)x->data;
+                        float* vec_d  = (float*)d->data;
+                        float* vec_x  = (float*)x->data;
                         float* vec_x2 = (float*)x2->data;
                         for (int j = 0; j < ggml_nelements(x); j++) {
                             vec_x2[j] = vec_x[j] + vec_d[j] * dt_1;
@@ -4134,8 +4134,8 @@ class StableDiffusionGGML {
                 LOG_INFO("sampling using DPM++ (2s) a method");
                 ggml_set_dynamic(ctx, false);
                 struct ggml_tensor* noise = ggml_dup_tensor(ctx, x);
-                struct ggml_tensor* d = ggml_dup_tensor(ctx, x);
-                struct ggml_tensor* x2 = ggml_dup_tensor(ctx, x);
+                struct ggml_tensor* d     = ggml_dup_tensor(ctx, x);
+                struct ggml_tensor* x2    = ggml_dup_tensor(ctx, x);
                 ggml_set_dynamic(ctx, params.dynamic);

                 for (int i = 0; i < steps; i++) {
@@ -4143,16 +4143,16 @@ class StableDiffusionGGML {
                     denoise(x, sigmas[i], i + 1);

                     // get_ancestral_step
-                    float sigma_up = std::min(sigmas[i + 1],
-                                              std::sqrt(sigmas[i + 1] * sigmas[i + 1] * (sigmas[i] * sigmas[i] - sigmas[i + 1] * sigmas[i + 1]) / (sigmas[i] * sigmas[i])));
+                    float sigma_up   = std::min(sigmas[i + 1],
+                                                std::sqrt(sigmas[i + 1] * sigmas[i + 1] * (sigmas[i] * sigmas[i] - sigmas[i + 1] * sigmas[i + 1]) / (sigmas[i] * sigmas[i])));
                     float sigma_down = std::sqrt(sigmas[i + 1] * sigmas[i + 1] - sigma_up * sigma_up);

-                    auto t_fn = [](float sigma) -> float { return -log(sigma); };
-                    auto sigma_fn = [](float t) -> float { return exp(-t); };
+                    auto t_fn     = [](float sigma) -> float { return -log(sigma); };
+                    auto sigma_fn = [](float t) -> float { return exp(-t); };

                     if (sigma_down == 0) {
                         // Euler step
-                        float* vec_d = (float*)d->data;
-                        float* vec_x = (float*)x->data;
+                        float* vec_d        = (float*)d->data;
+                        float* vec_x        = (float*)x->data;
                         float* vec_denoised = (float*)denoised->data;

                         for (int j = 0; j < ggml_nelements(d); j++) {
@@ -4169,14 +4169,14 @@ class StableDiffusionGGML {
                         }
                     } else {
                         // DPM-Solver++(2S)
-                        float t = t_fn(sigmas[i]);
+                        float t      = t_fn(sigmas[i]);
                         float t_next = t_fn(sigma_down);
-                        float h = t_next - t;
-                        float s = t + 0.5f * h;
+                        float h      = t_next - t;
+                        float s      = t + 0.5f * h;

-                        float* vec_d = (float*)d->data;
-                        float* vec_x = (float*)x->data;
-                        float* vec_x2 = (float*)x2->data;
+                        float* vec_d        = (float*)d->data;
+                        float* vec_x        = (float*)x->data;
+                        float* vec_x2       = (float*)x2->data;
                         float* vec_denoised = (float*)denoised->data;

                         // First half-step
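The DPM2 branch above is a midpoint method in log-sigma space: denoise at sigma_i, step halfway to the geometric mean sigma_mid, denoise again there, and take the full step with the midpoint slope. The step sizes, isolated as a sketch:

    #include <cmath>

    void dpm2_steps(float sigma, float sigma_next,
                    float& sigma_mid, float& dt_1, float& dt_2) {
        sigma_mid = std::exp(0.5f * (std::log(sigma) + std::log(sigma_next)));  // geometric mean
        dt_1      = sigma_mid - sigma;   // half step, using the slope at sigma
        dt_2      = sigma_next - sigma;  // full step, using the slope at sigma_mid
    }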
@@ -4196,7 +4196,7 @@ class StableDiffusionGGML {
                     if (sigmas[i + 1] > 0) {
                         ggml_tensor_set_f32_randn(noise, rng);
                         {
-                            float* vec_x = (float*)x->data;
+                            float* vec_x     = (float*)x->data;
                             float* vec_noise = (float*)noise->data;

                             for (int i = 0; i < ggml_nelements(x); i++) {
@@ -4219,13 +4219,13 @@ class StableDiffusionGGML {
                     // denoise
                     denoise(x, sigmas[i], i + 1);

-                    float t = t_fn(sigmas[i]);
-                    float t_next = t_fn(sigmas[i + 1]);
-                    float h = t_next - t;
-                    float a = sigmas[i + 1] / sigmas[i];
-                    float b = exp(-h) - 1.f;
-                    float* vec_x = (float*)x->data;
-                    float* vec_denoised = (float*)denoised->data;
+                    float t                 = t_fn(sigmas[i]);
+                    float t_next            = t_fn(sigmas[i + 1]);
+                    float h                 = t_next - t;
+                    float a                 = sigmas[i + 1] / sigmas[i];
+                    float b                 = exp(-h) - 1.f;
+                    float* vec_x            = (float*)x->data;
+                    float* vec_denoised     = (float*)denoised->data;
                     float* vec_old_denoised = (float*)old_denoised->data;

                     if (i == 0 || sigmas[i + 1] == 0) {
@@ -4235,10 +4235,10 @@ class StableDiffusionGGML {
                         }
                     } else {
                         float h_last = t - t_fn(sigmas[i - 1]);
-                        float r = h_last / h;
+                        float r      = h_last / h;
                         for (int j = 0; j < ggml_nelements(x); j++) {
                             float denoised_d = (1.f + 1.f / (2.f * r)) * vec_denoised[j] - (1.f / (2.f * r)) * vec_old_denoised[j];
-                            vec_x[j] = a * vec_x[j] - b * denoised_d;
+                            vec_x[j]         = a * vec_x[j] - b * denoised_d;
                         }
                     }

@@ -4261,12 +4261,12 @@ class StableDiffusionGGML {
                     // denoise
                     denoise(x, sigmas[i], i + 1);

-                    float t = t_fn(sigmas[i]);
-                    float t_next = t_fn(sigmas[i + 1]);
-                    float h = t_next - t;
-                    float a = sigmas[i + 1] / sigmas[i];
-                    float* vec_x = (float*)x->data;
-                    float* vec_denoised = (float*)denoised->data;
+                    float t                 = t_fn(sigmas[i]);
+                    float t_next            = t_fn(sigmas[i + 1]);
+                    float h                 = t_next - t;
+                    float a                 = sigmas[i + 1] / sigmas[i];
+                    float* vec_x            = (float*)x->data;
+                    float* vec_denoised     = (float*)denoised->data;
                     float* vec_old_denoised = (float*)old_denoised->data;

                     if (i == 0 || sigmas[i + 1] == 0) {
@@ -4277,14 +4277,14 @@ class StableDiffusionGGML {
                         }
                     } else {
                         float h_last = t - t_fn(sigmas[i - 1]);
-                        float h_min = std::min(h_last, h);
-                        float h_max = std::max(h_last, h);
-                        float r = h_max / h_min;
-                        float h_d = (h_max + h_min) / 2.f;
-                        float b = exp(-h_d) - 1.f;
+                        float h_min  = std::min(h_last, h);
+                        float h_max  = std::max(h_last, h);
+                        float r      = h_max / h_min;
+                        float h_d    = (h_max + h_min) / 2.f;
+                        float b      = exp(-h_d) - 1.f;
                         for (int j = 0; j < ggml_nelements(x); j++) {
                             float denoised_d = (1.f + 1.f / (2.f * r)) * vec_denoised[j] - (1.f / (2.f * r)) * vec_old_denoised[j];
-                            vec_x[j] = a * vec_x[j] - b * denoised_d;
+                            vec_x[j]         = a * vec_x[j] - b * denoised_d;
                         }
                     }

@@ -4299,7 +4299,7 @@ class StableDiffusionGGML {
                 LOG_INFO("sampling using LCM method");
                 ggml_set_dynamic(ctx, false);
                 struct ggml_tensor* noise = ggml_dup_tensor(ctx, x);
-                struct ggml_tensor* d = ggml_dup_tensor(ctx, x);
+                struct ggml_tensor* d     = ggml_dup_tensor(ctx, x);
                 ggml_set_dynamic(ctx, params.dynamic);

                 for (int i = 0; i < steps; i++) {
@@ -4310,7 +4310,7 @@ class StableDiffusionGGML {

                     // x = denoised
                     {
-                        float* vec_x = (float*)x->data;
+                        float* vec_x        = (float*)x->data;
                         float* vec_denoised = (float*)denoised->data;
                         for (int j = 0; j < ggml_nelements(x); j++) {
                             vec_x[j] = vec_denoised[j];
@@ -4322,7 +4322,7 @@ class StableDiffusionGGML {
                         ggml_tensor_set_f32_randn(noise, rng);
                         // noise = load_tensor_from_file(res_ctx, "./rand" + std::to_string(i+1) + ".bin");
                         {
-                            float* vec_x = (float*)x->data;
+                            float* vec_x     = (float*)x->data;
                             float* vec_noise = (float*)noise->data;

                             for (int j = 0; j < ggml_nelements(x); j++) {
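Both DPM++ (2M) variants above share the exponential-integrator update x = a*x - b*denoised_d, where denoised_d blends the current and previous denoised estimates by the step-size ratio r. One multistep update, isolated as a sketch with t(sigma) = -log(sigma) as in the hunks (the i == 0 first-step case is excluded):

    #include <cmath>
    #include <cstdint>

    void dpmpp_2m_update(float* x, const float* denoised, const float* old_denoised,
                         int64_t n, float sigma_prev, float sigma, float sigma_next) {
        float t      = -std::log(sigma);
        float t_next = -std::log(sigma_next);
        float h      = t_next - t;
        float a      = sigma_next / sigma;
        float b      = std::exp(-h) - 1.0f;
        float r      = (t - (-std::log(sigma_prev))) / h;  // h_last / h
        for (int64_t j = 0; j < n; j++) {
            float denoised_d = (1.0f + 1.0f / (2.0f * r)) * denoised[j] - (1.0f / (2.0f * r)) * old_denoised[j];
            x[j]             = a * x[j] - b * denoised_d;
        }
    }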
@@ -4365,8 +4365,8 @@ class StableDiffusionGGML {
     }

     ggml_tensor* encode_first_stage(ggml_context* res_ctx, ggml_tensor* x) {
-        int64_t W = x->ne[0];
-        int64_t H = x->ne[1];
+        int64_t W                  = x->ne[0];
+        int64_t H                  = x->ne[1];
         struct ggml_tensor* result = NULL;
         struct ggml_cplan cplan;
@@ -4374,10 +4374,10 @@ class StableDiffusionGGML {
         size_t ctx_size = 10 * 1024 * 1024;  // 10MB
         {
             struct ggml_init_params params;
-            params.mem_size = ctx_size;
+            params.mem_size   = ctx_size;
             params.mem_buffer = NULL;
-            params.no_alloc = true;
-            params.dynamic = dynamic;
+            params.no_alloc   = true;
+            params.dynamic    = dynamic;

             struct ggml_context* ctx = ggml_init(params);
             if (!ctx) {
@@ -4389,7 +4389,7 @@ class StableDiffusionGGML {
             ctx_size += ggml_used_mem(ctx) + ggml_used_mem_of_data(ctx);

             struct ggml_cgraph* vae_graph = ggml_build_forward_ctx(ctx, moments);
-            cplan = ggml_graph_plan(vae_graph, n_threads);
+            cplan                         = ggml_graph_plan(vae_graph, n_threads);

             ctx_size += cplan.work_size;

             LOG_DEBUG("vae context need %.2fMB static memory, with work_size needing %.2fMB",
@@ -4401,10 +4401,10 @@ class StableDiffusionGGML {

         {
             struct ggml_init_params params;
-            params.mem_size = ctx_size;
+            params.mem_size   = ctx_size;
             params.mem_buffer = NULL;
-            params.no_alloc = false;
-            params.dynamic = dynamic;
+            params.no_alloc   = false;
+            params.dynamic    = dynamic;

             struct ggml_context* ctx = ggml_init(params);
             if (!ctx) {
@@ -4415,7 +4415,7 @@ class StableDiffusionGGML {
             struct ggml_tensor* moments = first_stage_model.encode(ctx, x);

             struct ggml_cgraph* vae_graph = ggml_new_graph(ctx);
-            vae_graph->order = GGML_CGRAPH_EVAL_ORDER_RIGHT_TO_LEFT;
+            vae_graph->order              = GGML_CGRAPH_EVAL_ORDER_RIGHT_TO_LEFT;
             ggml_build_forward_expand(vae_graph, moments);

             int64_t t0 = ggml_time_ms();
@@ -4460,26 +4460,26 @@ class StableDiffusionGGML {
     // ldm.models.diffusion.ddpm.LatentDiffusion.get_first_stage_encoding
     ggml_tensor* get_first_stage_encoding(ggml_context* res_ctx, ggml_tensor* moments) {
         // ldm.modules.distributions.distributions.DiagonalGaussianDistribution.sample
-        ggml_tensor* latent = ggml_new_tensor_4d(res_ctx, moments->type, moments->ne[0],
-                                                 moments->ne[1], moments->ne[2] / 2, moments->ne[3]);
+        ggml_tensor* latent       = ggml_new_tensor_4d(res_ctx, moments->type, moments->ne[0],
+                                                       moments->ne[1], moments->ne[2] / 2, moments->ne[3]);
         struct ggml_tensor* noise = ggml_dup_tensor(res_ctx, latent);
         ggml_tensor_set_f32_randn(noise, rng);
         // noise = load_tensor_from_file(res_ctx, "noise.bin");
         {
-            float mean = 0;
+            float mean   = 0;
             float logvar = 0;
-            float value = 0;
-            float std_ = 0;
+            float value  = 0;
+            float std_   = 0;
             for (int i = 0; i < latent->ne[3]; i++) {
                 for (int j = 0; j < latent->ne[2]; j++) {
                     for (int k = 0; k < latent->ne[1]; k++) {
                         for (int l = 0; l < latent->ne[0]; l++) {
-                            mean = ggml_tensor_get_f32(moments, l, k, j, i);
+                            mean   = ggml_tensor_get_f32(moments, l, k, j, i);
                             logvar = ggml_tensor_get_f32(moments, l, k, j + (int)latent->ne[2], i);
                             logvar = std::max(-30.0f, std::min(logvar, 20.0f));
-                            std_ = std::exp(0.5f * logvar);
-                            value = mean + std_ * ggml_tensor_get_f32(noise, l, k, j, i);
-                            value = value * scale_factor;
+                            std_   = std::exp(0.5f * logvar);
+                            value  = mean + std_ * ggml_tensor_get_f32(noise, l, k, j, i);
+                            value  = value * scale_factor;
                             // printf("%d %d %d %d -> %f\n", i, j, k, l, value);
                             ggml_tensor_set_f32(latent, value, l, k, j, i);
                         }
@@ -4491,8 +4491,8 @@ class StableDiffusionGGML {
     }

     ggml_tensor* decode_first_stage(ggml_context* res_ctx, ggml_tensor* z) {
-        int64_t W = z->ne[0];
-        int64_t H = z->ne[1];
+        int64_t W                      = z->ne[0];
+        int64_t H                      = z->ne[1];
         struct ggml_tensor* result_img = NULL;
         struct ggml_cplan cplan;
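get_first_stage_encoding() above implements DiagonalGaussianDistribution.sample: the encoder's "moments" tensor stacks mean and log-variance along the channel axis, and each latent value is drawn with the reparameterization trick, then multiplied by scale_factor (0.18215). Per element, as a sketch:

    #include <algorithm>
    #include <cmath>

    float sample_latent(float mean, float logvar, float noise, float scale_factor) {
        logvar     = std::max(-30.0f, std::min(logvar, 20.0f));  // clamp as above
        float std_ = std::exp(0.5f * logvar);
        return (mean + std_ * noise) * scale_factor;
    }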
@@ -4507,10 +4507,10 @@ class StableDiffusionGGML {
         size_t ctx_size = 10 * 1024 * 1024;  // 10MB
         {
             struct ggml_init_params params;
-            params.mem_size = ctx_size;
+            params.mem_size   = ctx_size;
             params.mem_buffer = NULL;
-            params.no_alloc = true;
-            params.dynamic = dynamic;
+            params.no_alloc   = true;
+            params.dynamic    = dynamic;

             struct ggml_context* ctx = ggml_init(params);
             if (!ctx) {
@@ -4522,7 +4522,7 @@ class StableDiffusionGGML {
             ctx_size += ggml_used_mem(ctx) + ggml_used_mem_of_data(ctx);

             struct ggml_cgraph* vae_graph = ggml_build_forward_ctx(ctx, img);
-            cplan = ggml_graph_plan(vae_graph, n_threads);
+            cplan                         = ggml_graph_plan(vae_graph, n_threads);

             ctx_size += cplan.work_size;

             LOG_DEBUG("vae context need %.2fMB static memory, with work_size needing %.2fMB",
@@ -4534,10 +4534,10 @@ class StableDiffusionGGML {

         {
             struct ggml_init_params params;
-            params.mem_size = ctx_size;
+            params.mem_size   = ctx_size;
             params.mem_buffer = NULL;
-            params.no_alloc = false;
-            params.dynamic = dynamic;
+            params.no_alloc   = false;
+            params.dynamic    = dynamic;

             struct ggml_context* ctx = ggml_init(params);
             if (!ctx) {
@@ -4548,7 +4548,7 @@ class StableDiffusionGGML {
             struct ggml_tensor* img = first_stage_model.decode(ctx, z);

             struct ggml_cgraph* vae_graph = ggml_new_graph(ctx);
-            vae_graph->order = GGML_CGRAPH_EVAL_ORDER_RIGHT_TO_LEFT;
+            vae_graph->order              = GGML_CGRAPH_EVAL_ORDER_RIGHT_TO_LEFT;
             ggml_build_forward_expand(vae_graph, img);

             int64_t t0 = ggml_time_ms();
@@ -4621,9 +4621,9 @@ std::vector<uint8_t> StableDiffusion::txt2img(std::string prompt,
     struct ggml_init_params params;
     params.mem_size = static_cast<size_t>(10 * 1024) * 1024;  // 10M
     params.mem_size += width * height * 3 * sizeof(float) * 2;
-    params.mem_buffer = NULL;
-    params.no_alloc = false;
-    params.dynamic = false;
+    params.mem_buffer = NULL;
+    params.no_alloc   = false;
+    params.dynamic    = false;
     struct ggml_context* ctx = ggml_init(params);
     if (!ctx) {
         LOG_ERROR("ggml_init() failed");
@@ -4636,7 +4636,7 @@ std::vector<uint8_t> StableDiffusion::txt2img(std::string prompt,
     sd->rng->manual_seed(seed);

     // extract and remote lora
-    auto result_pair = extract_and_remove_lora(prompt);
+    auto result_pair                                = extract_and_remove_lora(prompt);
     std::unordered_map<std::string, float> lora_f2m = result_pair.first;  // lora_name -> multiplier
     for (auto& kv : lora_f2m) {
         LOG_DEBUG("lora %s:%.2f", kv.first.c_str(), kv.second);
@@ -4650,8 +4650,8 @@ std::vector<uint8_t> StableDiffusion::txt2img(std::string prompt,
     int64_t t1 = ggml_time_ms();
     LOG_INFO("apply_loras completed, taking %.2fs", (t1 - t0) * 1.0f / 1000);

-    t0 = ggml_time_ms();
-    ggml_tensor* c = sd->get_learned_condition(ctx, prompt);
+    t0                     = ggml_time_ms();
+    ggml_tensor* c         = sd->get_learned_condition(ctx, prompt);
     struct ggml_tensor* uc = NULL;
     if (cfg_scale != 1.0) {
         uc = sd->get_learned_condition(ctx, negative_prompt);
@@ -4665,9 +4665,9 @@ std::vector<uint8_t> StableDiffusion::txt2img(std::string prompt,
         sd->clip_params_ctx = NULL;
     }

-    int C = 4;
-    int W = width / 8;
-    int H = height / 8;
+    int C = 4;
+    int W = width / 8;
+    int H = height / 8;
     struct ggml_tensor* x_t = ggml_new_tensor_4d(ctx, GGML_TYPE_F32, W, H, C, 1);
     ggml_tensor_set_f32_randn(x_t, sd->rng);
@@ -4728,7 +4728,7 @@ std::vector<uint8_t> StableDiffusion::img2img(const std::vector<uint8_t>& init_i
     LOG_INFO("img2img %dx%d", width, height);

     std::vector<float> sigmas = sd->denoiser->schedule->get_sigmas(sample_steps);
-    size_t t_enc = static_cast<size_t>(sample_steps * strength);
+    size_t t_enc              = static_cast<size_t>(sample_steps * strength);
     LOG_INFO("target t_enc is %zu steps", t_enc);
     std::vector<float> sigma_sched;
     sigma_sched.assign(sigmas.begin() + sample_steps - t_enc - 1, sigmas.end());
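The img2img hunk above shortens the schedule by the denoising strength: with sample_steps steps and strength in [0, 1), only the tail of the sigma table is kept, so a lower strength preserves more of the init image. The slice, isolated as a sketch:

    #include <cstddef>
    #include <vector>

    // Keep the last t_enc + 2 sigma values; e.g. 20 steps at strength 0.75
    // gives t_enc = 15, keeping 17 of the 21 sigmas. Assumes t_enc < sample_steps.
    std::vector<float> truncated_schedule(const std::vector<float>& sigmas,
                                          size_t sample_steps, float strength) {
        size_t t_enc = static_cast<size_t>(sample_steps * strength);
        return std::vector<float>(sigmas.begin() + sample_steps - t_enc - 1, sigmas.end());
    }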
@@ -4736,9 +4736,9 @@ std::vector<uint8_t> StableDiffusion::img2img(const std::vector<uint8_t>& init_i
     struct ggml_init_params params;
     params.mem_size = static_cast<size_t>(10 * 1024) * 1024;  // 10M
     params.mem_size += width * height * 3 * sizeof(float) * 2;
-    params.mem_buffer = NULL;
-    params.no_alloc = false;
-    params.dynamic = false;
+    params.mem_buffer = NULL;
+    params.no_alloc   = false;
+    params.dynamic    = false;
     struct ggml_context* ctx = ggml_init(params);
     if (!ctx) {
         LOG_ERROR("ggml_init() failed");
@@ -4751,7 +4751,7 @@ std::vector<uint8_t> StableDiffusion::img2img(const std::vector<uint8_t>& init_i
     sd->rng->manual_seed(seed);

     // extract and remote lora
-    auto result_pair = extract_and_remove_lora(prompt);
+    auto result_pair                                = extract_and_remove_lora(prompt);
     std::unordered_map<std::string, float> lora_f2m = result_pair.first;  // lora_name -> multiplier
     for (auto& kv : lora_f2m) {
         LOG_DEBUG("lora %s:%.2f", kv.first.c_str(), kv.second);
@@ -4768,8 +4768,8 @@ std::vector<uint8_t> StableDiffusion::img2img(const std::vector<uint8_t>& init_i
     ggml_tensor* init_img = ggml_new_tensor_4d(ctx, GGML_TYPE_F32, width, height, 3, 1);
     image_vec_to_ggml(init_img_vec, init_img);

-    t0 = ggml_time_ms();
-    ggml_tensor* moments = sd->encode_first_stage(ctx, init_img);
+    t0                       = ggml_time_ms();
+    ggml_tensor* moments     = sd->encode_first_stage(ctx, init_img);
     ggml_tensor* init_latent = sd->get_first_stage_encoding(ctx, moments);
     // print_ggml_tensor(init_latent);
     t1 = ggml_time_ms();
@@ -4777,7 +4777,7 @@ std::vector<uint8_t> StableDiffusion::img2img(const std::vector<uint8_t>& init_i

     ggml_reset_curr_max_dynamic_size();  // reset counter

-    ggml_tensor* c = sd->get_learned_condition(ctx, prompt);
+    ggml_tensor* c         = sd->get_learned_condition(ctx, prompt);
     struct ggml_tensor* uc = NULL;
     if (cfg_scale != 1.0) {
         uc = sd->get_learned_condition(ctx, negative_prompt);
diff --git a/stable-diffusion.h b/stable-diffusion.h
index ed8cd1f..84d261e 100644
--- a/stable-diffusion.h
+++ b/stable-diffusion.h
@@ -38,15 +38,15 @@ enum Schedule {
 class StableDiffusionGGML;

 class StableDiffusion {
-   private:
+private:
     std::shared_ptr<StableDiffusionGGML> sd;

-   public:
-    StableDiffusion(int n_threads = -1,
-                    bool vae_decode_only = false,
+public:
+    StableDiffusion(int n_threads                = -1,
+                    bool vae_decode_only         = false,
                     bool free_params_immediately = false,
-                    std::string lora_model_dir = "",
-                    RNGType rng_type = STD_DEFAULT_RNG);
+                    std::string lora_model_dir   = "",
+                    RNGType rng_type             = STD_DEFAULT_RNG);
     bool load_from_file(const std::string& file_path, Schedule d = DEFAULT);
     std::vector<uint8_t> txt2img(
         std::string prompt,