fix: repair flash attention support (#386)

* repair flash attention in _ext
  this does not fix the currently broken fa behind the define, which is only used by VAE
  Co-authored-by: FSSRepo <FSSRepo@users.noreply.github.com>
* make flash attention in the diffusion model a runtime flag
  no support for sd3 or video
* remove old flash attention option and switch vae over to attn_ext
* update docs
* format code
---------
Co-authored-by: FSSRepo <FSSRepo@users.noreply.github.com>
Co-authored-by: leejet <leejet714@gmail.com>
2024-11-23 05:39:08 +01:00
parent ea9b647080
commit 1c168d98a5
17 changed files with 334 additions and 314 deletions
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -29,7 +29,6 @@ option(SD_HIPBLAS                    "sd: rocm backend" OFF)
 option(SD_METAL                      "sd: metal backend" OFF)
 option(SD_VULKAN                     "sd: vulkan backend" OFF)
 option(SD_SYCL                       "sd: sycl backend" OFF)
 option(SD_FLASH_ATTN                 "sd: use flash attention for x4 less memory usage" OFF)
 option(SD_FAST_SOFTMAX               "sd: x1.5 faster softmax, indeterministic (sometimes, same seed don't generate same image), cuda only" OFF)
 option(SD_BUILD_SHARED_LIBS          "sd: build shared libs" OFF)
 #option(SD_BUILD_SERVER               "sd: build server example"                           ON)
@@ -61,11 +60,6 @@ if (SD_HIPBLAS)
    endif()
 endif ()
 if(SD_FLASH_ATTN)
    message("-- Use Flash Attention for memory optimization")
    add_definitions(-DSD_USE_FLASH_ATTENTION)
 endif()
 set(SD_LIB stable-diffusion)
 file(GLOB SD_LIB_SOURCES 
--- a/README.md
+++ b/README.md
@@ -24,7 +24,7 @@ Inference of Stable Diffusion and Flux in pure C/C++
 - Full CUDA, Metal, Vulkan and SYCL backend for GPU acceleration.
 - Can load ckpt, safetensors and diffusers models/checkpoints. Standalone VAEs models
    - No need to convert to `.ggml` or `.gguf` anymore!
- Flash Attention for memory usage optimization (only cpu for now)
+- Flash Attention for memory usage optimization
 - Original `txt2img` and `img2img` mode
 - Negative prompt
 - [stable-diffusion-webui](https://github.com/AUTOMATIC1111/stable-diffusion-webui) style tokenizer (not all the features, only token weighting for now)
@@ -182,11 +182,21 @@ Example of text2img by using SYCL backend:
 ##### Using Flash Attention
-Enabling flash attention reduces memory usage by at least 400 MB. At the moment, it is not supported when CUBLAS is enabled because the kernel implementation is missing.
+Enabling flash attention for the diffusion model reduces memory usage by varying amounts of MB.
 eg.:
 - flux 768x768 ~600mb
 - SD2 768x768 ~1400mb
 For most backends, it slows things down, but for cuda it generally speeds it up too.
 At the moment, it is only supported for some models and some backends (like cpu, cuda/rocm, metal).
 Run by adding `--diffusion-fa` to the arguments and watch for:
 ```
-cmake .. -DSD_FLASH_ATTN=ON
+[INFO ] stable-diffusion.cpp:312  - Using flash attention in the diffusion model
-cmake --build . --config Release
+```
 and the compute buffer shrink in the debug log:
 ```
 [DEBUG] ggml_extend.hpp:1004 - flux compute buffer size: 650.00 MB(VRAM)
 ```
 ### Run
@@ -240,6 +250,9 @@ arguments:
  --vae-tiling                       process vae in tiles to reduce memory usage
  --vae-on-cpu                       keep vae in cpu (for low vram)
  --clip-on-cpu                      keep clip in cpu (for low vram)
  --diffusion-fa                     use flash attention in the diffusion model (for low vram)
                                     Might lower quality, since it implies converting k and v to f16.
                                     This might crash if it is not supported by the backend.
  --control-net-cpu                  keep controlnet in cpu (for low vram)
  --canny                            apply canny preprocessor (edge detection)
  --color                            Colors the logging tags according to level
--- a/clip.hpp
+++ b/clip.hpp
@@ -344,7 +344,6 @@ public:
    }
    std::string clean_up_tokenization(std::string& text) {
        std::regex pattern(R"( ,)");
        // Replace " ," with ","
        std::string result = std::regex_replace(text, pattern, ",");
@@ -768,8 +767,7 @@ public:
        blocks["post_layernorm"] = std::shared_ptr<GGMLBlock>(new LayerNorm(hidden_size));
    }
-    struct ggml_tensor* forward(struct ggml_context* ctx, struct ggml_tensor* pixel_values, 
+    struct ggml_tensor* forward(struct ggml_context* ctx, struct ggml_tensor* pixel_values, bool return_pooled = true) {
                                bool return_pooled = true) {
        // pixel_values: [N, num_channels, image_size, image_size]
        auto embeddings     = std::dynamic_pointer_cast<CLIPVisionEmbeddings>(blocks["embeddings"]);
        auto pre_layernorm  = std::dynamic_pointer_cast<LayerNorm>(blocks["pre_layernorm"]);
--- a/common.hpp
+++ b/common.hpp
@@ -245,16 +245,19 @@ protected:
    int64_t context_dim;
    int64_t n_head;
    int64_t d_head;
    bool flash_attn;
 public:
    CrossAttention(int64_t query_dim,
                   int64_t context_dim,
                   int64_t n_head,
-                   int64_t d_head)
+                   int64_t d_head,
                   bool flash_attn = false)
        : n_head(n_head),
          d_head(d_head),
          query_dim(query_dim),
-          context_dim(context_dim) {
+          context_dim(context_dim),
          flash_attn(flash_attn) {
        int64_t inner_dim = d_head * n_head;
        blocks["to_q"] = std::shared_ptr<GGMLBlock>(new Linear(query_dim, inner_dim, false));
@@ -283,7 +286,7 @@ public:
        auto k = to_k->forward(ctx, context);  // [N, n_context, inner_dim]
        auto v = to_v->forward(ctx, context);  // [N, n_context, inner_dim]
-        x = ggml_nn_attention_ext(ctx, q, k, v, n_head, NULL, false);  // [N, n_token, inner_dim]
+        x = ggml_nn_attention_ext(ctx, q, k, v, n_head, NULL, false, false, flash_attn);  // [N, n_token, inner_dim]
        x = to_out_0->forward(ctx, x);  // [N, n_token, query_dim]
        return x;
@@ -301,15 +304,16 @@ public:
                          int64_t n_head,
                          int64_t d_head,
                          int64_t context_dim,
-                          bool ff_in = false)
+                          bool ff_in      = false,
                          bool flash_attn = false)
        : n_head(n_head), d_head(d_head), ff_in(ff_in) {
        // disable_self_attn is always False
        // disable_temporal_crossattention is always False
        // switch_temporal_ca_to_sa is always False
        // inner_dim is always None or equal to dim
        // gated_ff is always True
-        blocks["attn1"] = std::shared_ptr<GGMLBlock>(new CrossAttention(dim, dim, n_head, d_head));
+        blocks["attn1"] = std::shared_ptr<GGMLBlock>(new CrossAttention(dim, dim, n_head, d_head, flash_attn));
-        blocks["attn2"] = std::shared_ptr<GGMLBlock>(new CrossAttention(dim, context_dim, n_head, d_head));
+        blocks["attn2"] = std::shared_ptr<GGMLBlock>(new CrossAttention(dim, context_dim, n_head, d_head, flash_attn));
        blocks["ff"]    = std::shared_ptr<GGMLBlock>(new FeedForward(dim, dim));
        blocks["norm1"] = std::shared_ptr<GGMLBlock>(new LayerNorm(dim));
        blocks["norm2"] = std::shared_ptr<GGMLBlock>(new LayerNorm(dim));
@@ -374,7 +378,8 @@ public:
                       int64_t n_head,
                       int64_t d_head,
                       int64_t depth,
-                       int64_t context_dim)
+                       int64_t context_dim,
                       bool flash_attn = false)
        : in_channels(in_channels),
          n_head(n_head),
          d_head(d_head),
@@ -388,7 +393,7 @@ public:
        for (int i = 0; i < depth; i++) {
            std::string name = "transformer_blocks." + std::to_string(i);
-            blocks[name]     = std::shared_ptr<GGMLBlock>(new BasicTransformerBlock(inner_dim, n_head, d_head, context_dim));
+            blocks[name]     = std::shared_ptr<GGMLBlock>(new BasicTransformerBlock(inner_dim, n_head, d_head, context_dim, false, flash_attn));
        }
        blocks["proj_out"] = std::shared_ptr<GGMLBlock>(new Conv2d(inner_dim, in_channels, {1, 1}));
--- a/conditioner.hpp
+++ b/conditioner.hpp
@@ -4,7 +4,6 @@
 #include "clip.hpp"
 #include "t5.hpp"
 struct SDCondition {
    struct ggml_tensor* c_crossattn = NULL;  // aka context
    struct ggml_tensor* c_vector    = NULL;  // aka y
--- a/diffusion_model.hpp
+++ b/diffusion_model.hpp
@@ -32,8 +32,9 @@ struct UNetModel : public DiffusionModel {
    UNetModel(ggml_backend_t backend,
              ggml_type wtype,
-              SDVersion version = VERSION_SD1)
+              SDVersion version = VERSION_SD1,
-        : unet(backend, wtype, version) {
+              bool flash_attn   = false)
        : unet(backend, wtype, version, flash_attn) {
    }
    void alloc_params_buffer() {
@@ -133,8 +134,9 @@ struct FluxModel : public DiffusionModel {
    FluxModel(ggml_backend_t backend,
              ggml_type wtype,
-              SDVersion version = VERSION_FLUX_DEV)
+              SDVersion version = VERSION_FLUX_DEV,
-        : flux(backend, wtype, version) {
+              bool flash_attn   = false)
        : flux(backend, wtype, version, flash_attn) {
    }
    void alloc_params_buffer() {
--- a/examples/cli/main.cpp
+++ b/examples/cli/main.cpp
@@ -116,6 +116,7 @@ struct SDParams {
    bool normalize_input          = false;
    bool clip_on_cpu              = false;
    bool vae_on_cpu               = false;
    bool diffusion_flash_attn     = false;
    bool canny_preprocess         = false;
    bool color                    = false;
    int upscale_repeats           = 1;
@@ -151,6 +152,7 @@ void print_params(SDParams params) {
    printf("    clip on cpu:       %s\n", params.clip_on_cpu ? "true" : "false");
    printf("    controlnet cpu:    %s\n", params.control_net_cpu ? "true" : "false");
    printf("    vae decoder on cpu:%s\n", params.vae_on_cpu ? "true" : "false");
    printf("    diffusion flash attention:%s\n", params.diffusion_flash_attn ? "true" : "false");
    printf("    strength(control): %.2f\n", params.control_strength);
    printf("    prompt:            %s\n", params.prompt.c_str());
    printf("    negative_prompt:   %s\n", params.negative_prompt.c_str());
@@ -227,6 +229,9 @@ void print_usage(int argc, const char* argv[]) {
    printf("  --vae-tiling                       process vae in tiles to reduce memory usage\n");
    printf("  --vae-on-cpu                       keep vae in cpu (for low vram)\n");
    printf("  --clip-on-cpu                      keep clip in cpu (for low vram)\n");
    printf("  --diffusion-fa                     use flash attention in the diffusion model (for low vram)\n");
    printf("                                     Might lower quality, since it implies converting k and v to f16.\n");
    printf("                                     This might crash if it is not supported by the backend.\n");
    printf("  --control-net-cpu                  keep controlnet in cpu (for low vram)\n");
    printf("  --canny                            apply canny preprocessor (edge detection)\n");
    printf("  --color                            Colors the logging tags according to level\n");
@@ -477,6 +482,8 @@ void parse_args(int argc, const char** argv, SDParams& params) {
            params.clip_on_cpu = true;  // will slow down get_learned_condiotion but necessary for low MEM GPUs
        } else if (arg == "--vae-on-cpu") {
            params.vae_on_cpu = true;  // will slow down latent decoding but necessary for low MEM GPUs
        } else if (arg == "--diffusion-fa") {
            params.diffusion_flash_attn = true;  // can reduce MEM significantly
        } else if (arg == "--canny") {
            params.canny_preprocess = true;
        } else if (arg == "-b" || arg == "--batch-count") {
@@ -868,7 +875,8 @@ int main(int argc, const char* argv[]) {
                                  params.schedule,
                                  params.clip_on_cpu,
                                  params.control_net_cpu,
-                                  params.vae_on_cpu);
+                                  params.vae_on_cpu,
                                  params.diffusion_flash_attn);
    if (sd_ctx == NULL) {
        printf("new_sd_ctx_t failed\n");
--- a/flux.hpp
+++ b/flux.hpp
@@ -115,25 +115,28 @@ namespace Flux {
                                                    struct ggml_tensor* q,
                                                    struct ggml_tensor* k,
                                                    struct ggml_tensor* v,
-                                                    struct ggml_tensor* pe) {
+                                                    struct ggml_tensor* pe,
                                                    bool flash_attn) {
        // q,k,v: [N, L, n_head, d_head]
        // pe: [L, d_head/2, 2, 2]
        // return: [N, L, n_head*d_head]
        q = apply_rope(ctx, q, pe);  // [N*n_head, L, d_head]
        k = apply_rope(ctx, k, pe);  // [N*n_head, L, d_head]
-        auto x = ggml_nn_attention_ext(ctx, q, k, v, v->ne[1], NULL, false, true);  // [N, L, n_head*d_head]
+        auto x = ggml_nn_attention_ext(ctx, q, k, v, v->ne[1], NULL, false, true, flash_attn);  // [N, L, n_head*d_head]
        return x;
    }
    struct SelfAttention : public GGMLBlock {
    public:
        int64_t num_heads;
        bool flash_attn;
    public:
        SelfAttention(int64_t dim,
                      int64_t num_heads = 8,
-                      bool qkv_bias     = false)
+                      bool qkv_bias     = false,
                      bool flash_attn   = false)
            : num_heads(num_heads) {
            int64_t head_dim = dim / num_heads;
            blocks["qkv"]    = std::shared_ptr<GGMLBlock>(new Linear(dim, dim * 3, qkv_bias));
@@ -168,7 +171,7 @@ namespace Flux {
            // pe: [n_token, d_head/2, 2, 2]
            // return [N, n_token, dim]
            auto qkv = pre_attention(ctx, x);                                   // q,k,v: [N, n_token, n_head, d_head]
-            x        = attention(ctx, qkv[0], qkv[1], qkv[2], pe);  // [N, n_token, dim]
+            x        = attention(ctx, qkv[0], qkv[1], qkv[2], pe, flash_attn);  // [N, n_token, dim]
            x        = post_attention(ctx, x);                                  // [N, n_token, dim]
            return x;
        }
@@ -237,15 +240,19 @@ namespace Flux {
    }
    struct DoubleStreamBlock : public GGMLBlock {
        bool flash_attn;
    public:
        DoubleStreamBlock(int64_t hidden_size,
                          int64_t num_heads,
                          float mlp_ratio,
-                          bool qkv_bias = false) {
+                          bool qkv_bias   = false,
                          bool flash_attn = false)
            : flash_attn(flash_attn) {
            int64_t mlp_hidden_dim = hidden_size * mlp_ratio;
            blocks["img_mod"]      = std::shared_ptr<GGMLBlock>(new Modulation(hidden_size, true));
            blocks["img_norm1"]    = std::shared_ptr<GGMLBlock>(new LayerNorm(hidden_size, 1e-6f, false));
-            blocks["img_attn"]     = std::shared_ptr<GGMLBlock>(new SelfAttention(hidden_size, num_heads, qkv_bias));
+            blocks["img_attn"]     = std::shared_ptr<GGMLBlock>(new SelfAttention(hidden_size, num_heads, qkv_bias, flash_attn));
            blocks["img_norm2"] = std::shared_ptr<GGMLBlock>(new LayerNorm(hidden_size, 1e-6f, false));
            blocks["img_mlp.0"] = std::shared_ptr<GGMLBlock>(new Linear(hidden_size, mlp_hidden_dim));
@@ -254,7 +261,7 @@ namespace Flux {
            blocks["txt_mod"]   = std::shared_ptr<GGMLBlock>(new Modulation(hidden_size, true));
            blocks["txt_norm1"] = std::shared_ptr<GGMLBlock>(new LayerNorm(hidden_size, 1e-6f, false));
-            blocks["txt_attn"]  = std::shared_ptr<GGMLBlock>(new SelfAttention(hidden_size, num_heads, qkv_bias));
+            blocks["txt_attn"]  = std::shared_ptr<GGMLBlock>(new SelfAttention(hidden_size, num_heads, qkv_bias, flash_attn));
            blocks["txt_norm2"] = std::shared_ptr<GGMLBlock>(new LayerNorm(hidden_size, 1e-6f, false));
            blocks["txt_mlp.0"] = std::shared_ptr<GGMLBlock>(new Linear(hidden_size, mlp_hidden_dim));
@@ -316,7 +323,7 @@ namespace Flux {
            auto k = ggml_concat(ctx, txt_k, img_k, 2);  // [N, n_txt_token + n_img_token, n_head, d_head]
            auto v = ggml_concat(ctx, txt_v, img_v, 2);  // [N, n_txt_token + n_img_token, n_head, d_head]
-            auto attn         = attention(ctx, q, k, v, pe);                          // [N, n_txt_token + n_img_token, n_head*d_head]
+            auto attn         = attention(ctx, q, k, v, pe, flash_attn);              // [N, n_txt_token + n_img_token, n_head*d_head]
            attn              = ggml_cont(ctx, ggml_permute(ctx, attn, 0, 2, 1, 3));  // [n_txt_token + n_img_token, N, hidden_size]
            auto txt_attn_out = ggml_view_3d(ctx,
                                             attn,
@@ -364,13 +371,15 @@ namespace Flux {
        int64_t num_heads;
        int64_t hidden_size;
        int64_t mlp_hidden_dim;
        bool flash_attn;
    public:
        SingleStreamBlock(int64_t hidden_size,
                          int64_t num_heads,
                          float mlp_ratio = 4.0f,
-                          float qk_scale  = 0.f)
+                          float qk_scale  = 0.f,
-            : hidden_size(hidden_size), num_heads(num_heads) {
+                          bool flash_attn = false)
            : hidden_size(hidden_size), num_heads(num_heads), flash_attn(flash_attn) {
            int64_t head_dim = hidden_size / num_heads;
            float scale      = qk_scale;
            if (scale <= 0.f) {
@@ -433,7 +442,7 @@ namespace Flux {
            auto v           = ggml_reshape_4d(ctx, qkv_vec[2], head_dim, num_heads, qkv_vec[2]->ne[1], qkv_vec[2]->ne[2]);  // [N, n_token, n_head, d_head]
            q                = norm->query_norm(ctx, q);
            k                = norm->key_norm(ctx, k);
-            auto attn        = attention(ctx, q, k, v, pe);  // [N, n_token, hidden_size]
+            auto attn        = attention(ctx, q, k, v, pe, flash_attn);  // [N, n_token, hidden_size]
            auto attn_mlp = ggml_concat(ctx, attn, ggml_gelu_inplace(ctx, mlp), 0);  // [N, n_token, hidden_size + mlp_hidden_dim]
            auto output   = linear2->forward(ctx, attn_mlp);                         // [N, n_token, hidden_size]
@@ -492,6 +501,7 @@ namespace Flux {
        int theta                   = 10000;
        bool qkv_bias               = true;
        bool guidance_embed         = true;
        bool flash_attn             = true;
    };
    struct Flux : public GGMLBlock {
@@ -646,13 +656,16 @@ namespace Flux {
                blocks["double_blocks." + std::to_string(i)] = std::shared_ptr<GGMLBlock>(new DoubleStreamBlock(params.hidden_size,
                                                                                                                params.num_heads,
                                                                                                                params.mlp_ratio,
-                                                                                                                params.qkv_bias));
+                                                                                                                params.qkv_bias,
                                                                                                                params.flash_attn));
            }
            for (int i = 0; i < params.depth_single_blocks; i++) {
                blocks["single_blocks." + std::to_string(i)] = std::shared_ptr<GGMLBlock>(new SingleStreamBlock(params.hidden_size,
                                                                                                                params.num_heads,
-                                                                                                                params.mlp_ratio));
+                                                                                                                params.mlp_ratio,
                                                                                                                0.f,
                                                                                                                params.flash_attn));
            }
            blocks["final_layer"] = std::shared_ptr<GGMLBlock>(new LastLayer(params.hidden_size, 1, out_channels));
@@ -817,8 +830,10 @@ namespace Flux {
        FluxRunner(ggml_backend_t backend,
                   ggml_type wtype,
-                   SDVersion version = VERSION_FLUX_DEV)
+                   SDVersion version = VERSION_FLUX_DEV,
                   bool flash_attn   = false)
            : GGMLRunner(backend, wtype) {
            flux_params.flash_attn = flash_attn;
            if (version == VERSION_FLUX_SCHNELL) {
                flux_params.guidance_embed = false;
            }
--- a/ggml_extend.hpp
+++ b/ggml_extend.hpp
@@ -666,32 +666,6 @@ __STATIC_INLINE__ std::vector<struct ggml_tensor*> split_qkv(struct ggml_context
    return {q, k, v};
 }
 // q: [N * n_head, n_token, d_head]
 // k: [N * n_head, n_k, d_head]
 // v: [N * n_head, d_head, n_k]
 // return: [N * n_head, n_token, d_head]
 __STATIC_INLINE__ struct ggml_tensor* ggml_nn_attention(struct ggml_context* ctx,
                                                        struct ggml_tensor* q,
                                                        struct ggml_tensor* k,
                                                        struct ggml_tensor* v,
                                                        bool mask = false) {
 #if defined(SD_USE_FLASH_ATTENTION) && !defined(SD_USE_CUBLAS) && !defined(SD_USE_METAL) && !defined(SD_USE_VULKAN) && !defined(SD_USE_SYCL)
    struct ggml_tensor* kqv = ggml_flash_attn(ctx, q, k, v, false);  // [N * n_head, n_token, d_head]
 #else
    float d_head = (float)q->ne[0];
    struct ggml_tensor* kq = ggml_mul_mat(ctx, k, q);  // [N * n_head, n_token, n_k]
    kq                     = ggml_scale_inplace(ctx, kq, 1.0f / sqrt(d_head));
    if (mask) {
        kq = ggml_diag_mask_inf_inplace(ctx, kq, 0);
    }
    kq = ggml_soft_max_inplace(ctx, kq);
    struct ggml_tensor* kqv = ggml_mul_mat(ctx, v, kq);  // [N * n_head, n_token, d_head]
 #endif
    return kqv;
 }
 // q: [N, L_q, C] or [N*n_head, L_q, d_head]
 // k: [N, L_k, C] or [N*n_head, L_k, d_head]
 // v: [N, L_k, C] or [N, L_k, n_head, d_head]
@@ -703,7 +677,8 @@ __STATIC_INLINE__ struct ggml_tensor* ggml_nn_attention_ext(struct ggml_context*
                                                            int64_t n_head,
                                                            struct ggml_tensor* mask = NULL,
                                                            bool diag_mask_inf       = false,
-                                                            bool skip_reshape        = false) {
+                                                            bool skip_reshape        = false,
                                                            bool flash_attn          = false) {
    int64_t L_q;
    int64_t L_k;
    int64_t C;
@@ -734,13 +709,42 @@ __STATIC_INLINE__ struct ggml_tensor* ggml_nn_attention_ext(struct ggml_context*
    float scale = (1.0f / sqrt((float)d_head));
-    bool use_flash_attn = false;
+    // if (flash_attn) {
-    ggml_tensor* kqv    = NULL;
+    //     LOG_DEBUG("attention_ext L_q:%d L_k:%d n_head:%d C:%d d_head:%d N:%d", L_q, L_k, n_head, C, d_head, N);
-    if (use_flash_attn) {
+    // }
    //  is there anything oddly shaped?? ping Green-Sky if you can trip this assert
    GGML_ASSERT(((L_k % 256 == 0) && L_q == L_k) || !(L_k % 256 == 0));
    bool can_use_flash_attn = true;
    can_use_flash_attn      = can_use_flash_attn && L_k % 256 == 0;
    can_use_flash_attn      = can_use_flash_attn && d_head % 64 == 0;  // double check
    // cuda max d_head seems to be 256, cpu does seem to work with 512
    can_use_flash_attn = can_use_flash_attn && d_head <= 256;  // double check
    if (mask != nullptr) {
        // TODO(Green-Sky): figure out if we can bend t5 to work too
        can_use_flash_attn = can_use_flash_attn && mask->ne[2] == 1;
        can_use_flash_attn = can_use_flash_attn && mask->ne[3] == 1;
    }
    // TODO(Green-Sky): more pad or disable for funny tensor shapes
    ggml_tensor* kqv = nullptr;
    // GGML_ASSERT((flash_attn && can_use_flash_attn) || !flash_attn);
    if (can_use_flash_attn && flash_attn) {
        // LOG_DEBUG("using flash attention");
        k = ggml_cast(ctx, k, GGML_TYPE_F16);
        v = ggml_cont(ctx, ggml_permute(ctx, v, 0, 2, 1, 3));  // [N, n_head, L_k, d_head]
        v = ggml_reshape_3d(ctx, v, d_head, L_k, n_head * N);  // [N * n_head, L_k, d_head]
-        LOG_DEBUG("k->ne[1] == %d", k->ne[1]);
+        v = ggml_cast(ctx, v, GGML_TYPE_F16);
        kqv = ggml_flash_attn_ext(ctx, q, k, v, mask, scale, 0, 0);
        ggml_flash_attn_ext_set_prec(kqv, GGML_PREC_F32);
        // kqv = ggml_view_3d(ctx, kqv, d_head, n_head, L_k, kqv->nb[1], kqv->nb[2], 0);
        kqv = ggml_view_3d(ctx, kqv, d_head, n_head, L_q, kqv->nb[1], kqv->nb[2], 0);
    } else {
        v = ggml_cont(ctx, ggml_permute(ctx, v, 1, 2, 0, 3));  // [N, n_head, d_head, L_k]
        v = ggml_reshape_3d(ctx, v, L_k, d_head, n_head * N);  // [N * n_head, d_head, L_k]
@@ -756,10 +760,12 @@ __STATIC_INLINE__ struct ggml_tensor* ggml_nn_attention_ext(struct ggml_context*
        kq = ggml_soft_max_inplace(ctx, kq);
        kqv = ggml_mul_mat(ctx, v, kq);  // [N * n_head, L_q, d_head]
    }
        kqv = ggml_reshape_4d(ctx, kqv, d_head, L_q, n_head, N);  // [N, n_head, L_q, d_head]
-    kqv = ggml_cont(ctx, ggml_permute(ctx, kqv, 0, 2, 1, 3));  // [N, L_q, n_head, d_head]
+        kqv = ggml_permute(ctx, kqv, 0, 2, 1, 3);                 // [N, L_q, n_head, d_head]
    }
    kqv = ggml_cont(ctx, kqv);
    kqv = ggml_reshape_3d(ctx, kqv, d_head * n_head, L_q, N);  // [N, L_q, C]
    return kqv;
@@ -1222,7 +1228,6 @@ protected:
        if (bias) {
            params["bias"] = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, out_features);
        }
    }
 public:
--- a/model.cpp
+++ b/model.cpp
@@ -650,7 +650,6 @@ uint16_t f8_e4m3_to_f16(uint8_t f8) {
    return ggml_fp32_to_fp16(*reinterpret_cast<const float*>(&result));
 }
 uint16_t f8_e5m2_to_f16(uint8_t fp8) {
    uint8_t sign     = (fp8 >> 7) & 0x1;
    uint8_t exponent = (fp8 >> 2) & 0x1F;
@@ -1434,7 +1433,6 @@ bool ModelLoader::init_from_ckpt_file(const std::string& file_path, const std::s
            std::string name = zip_entry_name(zip);
            size_t pos       = name.find("data.pkl");
            if (pos != std::string::npos) {
                std::string dir = name.substr(0, pos);
                printf("ZIP %d, name = %s, dir = %s \n", i, name.c_str(), dir.c_str());
                void* pkl_data = NULL;
--- a/pmid.hpp
+++ b/pmid.hpp
@@ -6,7 +6,6 @@
 #include "clip.hpp"
 #include "lora.hpp"
 struct FuseBlock : public GGMLBlock {
    // network hparams
    int in_dim;
@@ -78,7 +77,6 @@ class QFormerPerceiver(nn.Module):
        return out
 */
 struct PMFeedForward : public GGMLBlock {
    // network hparams
    int dim;
@@ -93,7 +91,6 @@ public:
    struct ggml_tensor* forward(struct ggml_context* ctx,
                                struct ggml_tensor* x) {
        auto norm = std::dynamic_pointer_cast<LayerNorm>(blocks["0"]);
        auto ff   = std::dynamic_pointer_cast<Mlp>(blocks["1"]);
@@ -101,7 +98,6 @@ public:
        x = ff->forward(ctx, x);
        return x;
    }
 };
 struct PerceiverAttention : public GGMLBlock {
@@ -112,7 +108,6 @@ struct PerceiverAttention : public GGMLBlock {
 public:
    PerceiverAttention(int dim, int dim_h = 64, int h = 8)
        : scale(powf(dim_h, -0.5)), dim_head(dim_h), heads(h) {
        int inner_dim    = dim_head * heads;
        blocks["norm1"]  = std::shared_ptr<GGMLBlock>(new LayerNorm(dim));
        blocks["norm2"]  = std::shared_ptr<GGMLBlock>(new LayerNorm(dim));
@@ -143,18 +138,15 @@ public:
    std::vector<struct ggml_tensor*> chunk_half(struct ggml_context* ctx,
                                                struct ggml_tensor* x) {
        auto tlo = ggml_view_4d(ctx, x, x->ne[0] / 2, x->ne[1], x->ne[2], x->ne[3], x->nb[1], x->nb[2], x->nb[3], 0);
        auto tli = ggml_view_4d(ctx, x, x->ne[0] / 2, x->ne[1], x->ne[2], x->ne[3], x->nb[1], x->nb[2], x->nb[3], x->nb[0] * x->ne[0] / 2);
        return {ggml_cont(ctx, tlo),
                ggml_cont(ctx, tli)};
    }
    struct ggml_tensor* forward(struct ggml_context* ctx,
                                struct ggml_tensor* x,
                                struct ggml_tensor* latents) {
        // x (torch.Tensor): image features
        //     shape (b, n1, D)
        // latent (torch.Tensor): latent features
@@ -206,6 +198,7 @@ public:
 struct FacePerceiverResampler : public GGMLBlock {
    // network hparams
    int depth;
 public:
    FacePerceiverResampler(int dim           = 768,
                           int d             = 4,
@@ -258,10 +251,8 @@ struct QFormerPerceiver : public GGMLBlock {
    int cross_attention_dim;
    bool use_residul;
 public:
-    QFormerPerceiver(int id_embeddings_dim, int cross_attention_d, int num_t, int embedding_dim=1024, 
+    QFormerPerceiver(int id_embeddings_dim, int cross_attention_d, int num_t, int embedding_dim = 1024, bool use_r = true, int ratio = 4)
                     bool use_r=true, int ratio=4)
        : cross_attention_dim(cross_attention_d), num_tokens(num_t), use_residul(use_r) {
        blocks["token_proj"]          = std::shared_ptr<GGMLBlock>(new Mlp(id_embeddings_dim,
                                                                           id_embeddings_dim * ratio,
@@ -346,8 +337,6 @@ class FacePerceiverResampler(torch.nn.Module):
        return self.norm_out(latents)
 */
 /*
 def FeedForward(dim, mult=4):
@@ -417,9 +406,6 @@ class PerceiverAttention(nn.Module):
 */
 struct FuseModule : public GGMLBlock {
    // network hparams
    int embed_dim;
@@ -555,7 +541,6 @@ struct PhotoMakerIDEncoderBlock : public CLIPVisionModelProjection {
 };
 struct PhotoMakerIDEncoder_CLIPInsightfaceExtendtokenBlock : public CLIPVisionModelProjection {
    int cross_attention_dim;
    int num_tokens;
@@ -578,7 +563,6 @@ struct PhotoMakerIDEncoder_CLIPInsightfaceExtendtokenBlock : public CLIPVisionMo
        blocks["qformer_perceiver"] = std::shared_ptr<GGMLBlock>(new QFormerPerceiver(id_embeddings_dim,
                                                                                      cross_attention_dim,
                                                                                      num_tokens));
    }
    /*
@@ -639,8 +623,7 @@ public:
    std::vector<float> zeros_right;
 public:
-    PhotoMakerIDEncoder(ggml_backend_t backend, ggml_type wtype, SDVersion version = VERSION_SDXL, 
+    PhotoMakerIDEncoder(ggml_backend_t backend, ggml_type wtype, SDVersion version = VERSION_SDXL, PMVersion pm_v = VERSION_1, float sty = 20.f)
                        PMVersion pm_v = VERSION_1, float sty = 20.f)
        : GGMLRunner(backend, wtype),
          version(version),
          pm_version(pm_v),
@@ -660,13 +643,11 @@ public:
        return pm_version;
    }
    void get_param_tensors(std::map<std::string, struct ggml_tensor*>& tensors, const std::string prefix) {
        if (pm_version == VERSION_1)
            id_encoder.get_param_tensors(tensors, prefix);
        else if (pm_version == VERSION_2)
            id_encoder2.get_param_tensors(tensors, prefix);
    }
    struct ggml_cgraph* build_graph(  // struct ggml_allocr* allocr,
@@ -791,9 +772,7 @@ public:
    }
 };
 struct PhotoMakerIDEmbed : public GGMLRunner {
    std::map<std::string, struct ggml_tensor*> tensors;
    std::string file_path;
    ModelLoader* model_loader;
@@ -805,8 +784,7 @@ struct PhotoMakerIDEmbed : public GGMLRunner {
                      ModelLoader* ml,
                      const std::string& file_path = "",
                      const std::string& prefix    = "")
-        : file_path(file_path), GGMLRunner(backend, wtype),
+        : file_path(file_path), GGMLRunner(backend, wtype), model_loader(ml) {
-        model_loader(ml) {
        if (!model_loader->init_from_file(file_path, prefix)) {
            load_failed = true;
        }
@@ -856,7 +834,6 @@ struct PhotoMakerIDEmbed : public GGMLRunner {
        return true;
    }
    struct ggml_tensor* get() {
        std::map<std::string, struct ggml_tensor*>::iterator pos;
        pos = tensors.find("pmid.id_embeds");
--- a/stable-diffusion.cpp
+++ b/stable-diffusion.cpp
@@ -156,7 +156,8 @@ public:
                        schedule_t schedule,
                        bool clip_on_cpu,
                        bool control_net_cpu,
-                        bool vae_on_cpu) {
+                        bool vae_on_cpu,
+                        bool diffusion_flash_attn) {
        use_tiny_autoencoder = taesd_path.size() > 0;
 #ifdef SD_USE_CUBLAS
        LOG_DEBUG("Using CUDA backend");
@@ -185,13 +186,7 @@ public:
            LOG_DEBUG("Using CPU backend");
            backend = ggml_backend_cpu_init();
        }
-#ifdef SD_USE_FLASH_ATTENTION
+
-#if defined(SD_USE_CUBLAS) || defined(SD_USE_METAL) || defined(SD_USE_SYCL) || defined(SD_USE_VULKAN)
-        LOG_WARN("Flash Attention not supported with GPU Backend");
-#else
-        LOG_INFO("Flash Attention enabled");
-#endif
-#endif
        ModelLoader model_loader;
        vae_tiling = vae_tiling_;
@@ -325,19 +320,25 @@ public:
                LOG_INFO("CLIP: Using CPU backend");
                clip_backend = ggml_backend_cpu_init();
            }
+            if (diffusion_flash_attn) {
+                LOG_INFO("Using flash attention in the diffusion model");
+            }
            if (version == VERSION_SD3_2B || version == VERSION_SD3_5_8B || version == VERSION_SD3_5_2B) {
+                if (diffusion_flash_attn) {
+                    LOG_WARN("flash attention in this diffusion model is currently unsupported!");
+                }
                cond_stage_model = std::make_shared<SD3CLIPEmbedder>(clip_backend, conditioner_wtype);
                diffusion_model  = std::make_shared<MMDiTModel>(backend, diffusion_model_wtype, version);
            } else if (version == VERSION_FLUX_DEV || version == VERSION_FLUX_SCHNELL || version == VERSION_FLUX_LITE) {
                cond_stage_model = std::make_shared<FluxCLIPEmbedder>(clip_backend, conditioner_wtype);
-                diffusion_model  = std::make_shared<FluxModel>(backend, diffusion_model_wtype, version);
+                diffusion_model  = std::make_shared<FluxModel>(backend, diffusion_model_wtype, version, diffusion_flash_attn);
            } else {
                if (id_embeddings_path.find("v2") != std::string::npos) {
                    cond_stage_model = std::make_shared<FrozenCLIPEmbedderWithCustomWords>(clip_backend, conditioner_wtype, embeddings_path, version, VERSION_2);
                } else {
                    cond_stage_model = std::make_shared<FrozenCLIPEmbedderWithCustomWords>(clip_backend, conditioner_wtype, embeddings_path, version);
                }
-                diffusion_model  = std::make_shared<UNetModel>(backend, diffusion_model_wtype, version);
+                diffusion_model = std::make_shared<UNetModel>(backend, diffusion_model_wtype, version, diffusion_flash_attn);
            }
            cond_stage_model->alloc_params_buffer();
            cond_stage_model->get_param_tensors(tensors);
@@ -1081,7 +1082,8 @@ sd_ctx_t* new_sd_ctx(const char* model_path_c_str,
                     enum schedule_t s,
                     bool keep_clip_on_cpu,
                     bool keep_control_net_cpu,
-                     bool keep_vae_on_cpu) {
+                     bool keep_vae_on_cpu,
+                     bool diffusion_flash_attn) {
    sd_ctx_t* sd_ctx = (sd_ctx_t*)malloc(sizeof(sd_ctx_t));
    if (sd_ctx == NULL) {
        return NULL;
@@ -1122,7 +1124,8 @@ sd_ctx_t* new_sd_ctx(const char* model_path_c_str,
                                    s,
                                    keep_clip_on_cpu,
                                    keep_control_net_cpu,
-                                    keep_vae_on_cpu)) {
+                                    keep_vae_on_cpu,
+                                    diffusion_flash_attn)) {
        delete sd_ctx->sd;
        sd_ctx->sd = NULL;
        free(sd_ctx);
--- a/stable-diffusion.h
+++ b/stable-diffusion.h
@@ -142,7 +142,8 @@ SD_API sd_ctx_t* new_sd_ctx(const char* model_path,
                            enum schedule_t s,
                            bool keep_clip_on_cpu,
                            bool keep_control_net_cpu,
-                            bool keep_vae_on_cpu);
+                            bool keep_vae_on_cpu,
+                            bool diffusion_flash_attn);
 SD_API void free_sd_ctx(sd_ctx_t* sd_ctx);
--- a/unet.hpp
+++ b/unet.hpp
@@ -183,7 +183,7 @@ public:
    int model_channels  = 320;
    int adm_in_channels = 2816;  // only for VERSION_SDXL/SVD
-    UnetModelBlock(SDVersion version = VERSION_SD1)
+    UnetModelBlock(SDVersion version = VERSION_SD1, bool flash_attn = false)
        : version(version) {
        if (version == VERSION_SD2) {
            context_dim       = 1024;
@@ -242,7 +242,7 @@ public:
            if (version == VERSION_SVD) {
                return new SpatialVideoTransformer(in_channels, n_head, d_head, depth, context_dim);
            } else {
-                return new SpatialTransformer(in_channels, n_head, d_head, depth, context_dim);
+                return new SpatialTransformer(in_channels, n_head, d_head, depth, context_dim, flash_attn);
            }
        };
@@ -533,8 +533,9 @@ struct UNetModelRunner : public GGMLRunner {
    UNetModelRunner(ggml_backend_t backend,
                    ggml_type wtype,
-                    SDVersion version = VERSION_SD1)
+                    SDVersion version = VERSION_SD1,
-        : GGMLRunner(backend, wtype), unet(version) {
+                    bool flash_attn   = false)
+        : GGMLRunner(backend, wtype), unet(version, flash_attn) {
        unet.init(params_ctx, wtype);
    }
--- a/util.cpp
+++ b/util.cpp
@@ -293,7 +293,6 @@ std::vector<std::string> splitString(const std::string& str, char delimiter) {
    return result;
 }
 sd_image_t* preprocess_id_image(sd_image_t* img) {
    int shortest_edge   = 224;
    int size            = shortest_edge;
--- a/vae.hpp
+++ b/vae.hpp
@@ -100,9 +100,11 @@ public:
        k      = ggml_reshape_3d(ctx, k, c, h * w, n);              // [N, h * w, in_channels]
        auto v = v_proj->forward(ctx, h_);                          // [N, in_channels, h, w]
-        v      = ggml_reshape_3d(ctx, v, h * w, c, n);  // [N, in_channels, h * w]
+        v      = ggml_cont(ctx, ggml_permute(ctx, v, 1, 2, 0, 3));  // [N, h, w, in_channels]
+        v      = ggml_reshape_3d(ctx, v, c, h * w, n);              // [N, h * w, in_channels]
-        h_ = ggml_nn_attention(ctx, q, k, v, false);  // [N, h * w, in_channels]
+        // h_ = ggml_nn_attention(ctx, q, k, v, false);  // [N, h * w, in_channels]
+        h_ = ggml_nn_attention_ext(ctx, q, k, v, 1, nullptr, false, true, false);
        h_ = ggml_cont(ctx, ggml_permute(ctx, h_, 1, 0, 2, 3));  // [N, in_channels, h * w]
        h_ = ggml_reshape_4d(ctx, h_, w, h, c, n);               // [N, in_channels, h, w]