refactor: reorganize code and use c api (#133)
This commit is contained in:
parent b139434b57
commit 2e79a82f85
@@ -60,7 +60,8 @@ add_subdirectory(thirdparty)
 set(SD_LIB stable-diffusion)
 
-add_library(${SD_LIB} stable-diffusion.h stable-diffusion.cpp model.h model.cpp util.h util.cpp)
+add_library(${SD_LIB} stable-diffusion.h stable-diffusion.cpp model.h model.cpp util.h util.cpp upscaler.cpp
+            ggml_extend.hpp clip.hpp common.hpp unet.hpp tae.hpp esrgan.hpp lora.hpp denoiser.hpp rng.hpp rng_philox.hpp)
 target_link_libraries(${SD_LIB} PUBLIC ggml zip)
 target_include_directories(${SD_LIB} PUBLIC . thirdparty)
 target_compile_features(${SD_LIB} PUBLIC cxx_std_11)
998 clip.hpp Normal file
@@ -0,0 +1,998 @@
#ifndef __CLIP_HPP__
#define __CLIP_HPP__

#include "ggml_extend.hpp"

/*================================================== CLIPTokenizer ===================================================*/

std::pair<std::unordered_map<std::string, float>, std::string> extract_and_remove_lora(std::string text) {
    std::regex re("<lora:([^:]+):([^>]+)>");
    std::smatch matches;
    std::unordered_map<std::string, float> filename2multiplier;

    while (std::regex_search(text, matches, re)) {
        std::string filename = matches[1].str();
        float multiplier     = std::stof(matches[2].str());

        text = std::regex_replace(text, re, "", std::regex_constants::format_first_only);

        if (multiplier == 0.f) {
            continue;
        }

        if (filename2multiplier.find(filename) == filename2multiplier.end()) {
            filename2multiplier[filename] = multiplier;
        } else {
            filename2multiplier[filename] += multiplier;
        }
    }

    return std::make_pair(filename2multiplier, text);
}
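
// Illustrative usage (sketch, not from the original commit): for the prompt
// "a photo <lora:film:0.8> of a cat", the call below returns {{"film", 0.8f}}
// plus the prompt with the lora tag stripped out:
//   auto res = extract_and_remove_lora("a photo <lora:film:0.8> of a cat");
//   // res.first["film"] == 0.8f, res.second == "a photo  of a cat"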

const std::string UNK_TOKEN = "<|endoftext|>";
const std::string BOS_TOKEN = "<|startoftext|>";
const std::string EOS_TOKEN = "<|endoftext|>";
const std::string PAD_TOKEN = "<|endoftext|>";

const int UNK_TOKEN_ID = 49407;
const int BOS_TOKEN_ID = 49406;
const int EOS_TOKEN_ID = 49407;
const int PAD_TOKEN_ID = 49407;

std::vector<std::pair<int, std::u32string>> bytes_to_unicode() {
    std::vector<std::pair<int, std::u32string>> byte_unicode_pairs;
    std::set<int> byte_set;
    for (int b = static_cast<int>('!'); b <= static_cast<int>('~'); ++b) {
        byte_set.insert(b);
        byte_unicode_pairs.push_back(std::pair<int, std::u32string>(b, unicode_value_to_utf32(b)));
    }
    for (int b = 161; b <= 172; ++b) {
        byte_set.insert(b);
        byte_unicode_pairs.push_back(std::pair<int, std::u32string>(b, unicode_value_to_utf32(b)));
    }
    for (int b = 174; b <= 255; ++b) {
        byte_set.insert(b);
        byte_unicode_pairs.push_back(std::pair<int, std::u32string>(b, unicode_value_to_utf32(b)));
    }
    int n = 0;
    for (int b = 0; b < 256; ++b) {
        if (byte_set.find(b) == byte_set.end()) {
            byte_unicode_pairs.push_back(std::pair<int, std::u32string>(b, unicode_value_to_utf32(n + 256)));
            ++n;
        }
    }
    // LOG_DEBUG("byte_unicode_pairs %d", byte_unicode_pairs.size());
    return byte_unicode_pairs;
}
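
// Worked example (illustrative): printable bytes map to themselves ('A' (65)
// -> U+0041), while the remaining bytes are remapped above U+0100 in ascending
// order, so the space byte (32), the 33rd unmapped byte, becomes U+0120 ('Ġ'),
// matching the GPT-2/CLIP byte-level BPE alphabet.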

// Ref: https://github.com/openai/CLIP/blob/main/clip/simple_tokenizer.py
class CLIPTokenizer {
private:
    SDVersion version = VERSION_1_x;
    std::map<int, std::u32string> byte_encoder;
    std::map<std::u32string, int> encoder;
    std::map<std::pair<std::u32string, std::u32string>, int> bpe_ranks;
    std::regex pat;

    static std::string strip(const std::string& str) {
        std::string::size_type start = str.find_first_not_of(" \t\n\r\v\f");
        std::string::size_type end   = str.find_last_not_of(" \t\n\r\v\f");

        if (start == std::string::npos) {
            // String contains only whitespace characters
            return "";
        }

        return str.substr(start, end - start + 1);
    }

    static std::string whitespace_clean(std::string text) {
        text = std::regex_replace(text, std::regex(R"(\s+)"), " ");
        text = strip(text);
        return text;
    }

    static std::set<std::pair<std::u32string, std::u32string>> get_pairs(const std::vector<std::u32string>& subwords) {
        std::set<std::pair<std::u32string, std::u32string>> pairs;
        if (subwords.size() == 0) {
            return pairs;
        }
        std::u32string prev_subword = subwords[0];
        for (int i = 1; i < subwords.size(); i++) {
            std::u32string subword = subwords[i];
            std::pair<std::u32string, std::u32string> pair(prev_subword, subword);
            pairs.insert(pair);
            prev_subword = subword;
        }
        return pairs;
    }

public:
    CLIPTokenizer(SDVersion version = VERSION_1_x)
        : version(version) {}

    void load_from_merges(const std::string& merges_utf8_str) {
        auto byte_unicode_pairs = bytes_to_unicode();
        byte_encoder = std::map<int, std::u32string>(byte_unicode_pairs.begin(), byte_unicode_pairs.end());
        // for (auto & pair: byte_unicode_pairs) {
        //     std::cout << pair.first << ": " << pair.second << std::endl;
        // }
        std::vector<std::u32string> merges;
        size_t start = 0;
        size_t pos;
        std::u32string merges_utf32_str = utf8_to_utf32(merges_utf8_str);
        while ((pos = merges_utf32_str.find('\n', start)) != std::string::npos) {
            merges.push_back(merges_utf32_str.substr(start, pos - start));
            start = pos + 1;
        }
        // LOG_DEBUG("merges size %llu", merges.size());
        GGML_ASSERT(merges.size() == 48895);
        merges = std::vector<std::u32string>(merges.begin() + 1, merges.end());
        std::vector<std::pair<std::u32string, std::u32string>> merge_pairs;
        for (const auto& merge : merges) {
            size_t space_pos = merge.find(' ');
            merge_pairs.emplace_back(merge.substr(0, space_pos), merge.substr(space_pos + 1));
            // LOG_DEBUG("%s", utf32_to_utf8(merge.substr(space_pos + 1)).c_str());
        }
        std::vector<std::u32string> vocab;
        for (const auto& pair : byte_unicode_pairs) {
            vocab.push_back(pair.second);
        }
        for (const auto& pair : byte_unicode_pairs) {
            vocab.push_back(pair.second + utf8_to_utf32("</w>"));
        }
        for (const auto& merge : merge_pairs) {
            vocab.push_back(merge.first + merge.second);
        }
        vocab.push_back(utf8_to_utf32("<|startoftext|>"));
        vocab.push_back(utf8_to_utf32("<|endoftext|>"));
        LOG_DEBUG("vocab size: %llu", vocab.size());
        int i = 0;
        for (const auto& token : vocab) {
            encoder[token] = i++;
        }

        int rank = 0;
        for (const auto& merge : merge_pairs) {
            bpe_ranks[merge] = rank++;
        }
    }

    std::u32string bpe(const std::u32string& token) {
        std::vector<std::u32string> word;

        for (int i = 0; i < token.size() - 1; i++) {
            word.emplace_back(1, token[i]);
        }
        word.push_back(token.substr(token.size() - 1) + utf8_to_utf32("</w>"));

        std::set<std::pair<std::u32string, std::u32string>> pairs = get_pairs(word);

        if (pairs.empty()) {
            return token + utf8_to_utf32("</w>");
        }

        while (true) {
            auto min_pair_iter = std::min_element(pairs.begin(),
                                                  pairs.end(),
                                                  [&](const std::pair<std::u32string, std::u32string>& a,
                                                      const std::pair<std::u32string, std::u32string>& b) {
                                                      if (bpe_ranks.find(a) == bpe_ranks.end()) {
                                                          return false;
                                                      } else if (bpe_ranks.find(b) == bpe_ranks.end()) {
                                                          return true;
                                                      }
                                                      return bpe_ranks.at(a) < bpe_ranks.at(b);
                                                  });

            const std::pair<std::u32string, std::u32string>& bigram = *min_pair_iter;

            if (bpe_ranks.find(bigram) == bpe_ranks.end()) {
                break;
            }

            std::u32string first  = bigram.first;
            std::u32string second = bigram.second;
            std::vector<std::u32string> new_word;
            int32_t i = 0;

            while (i < word.size()) {
                auto it = std::find(word.begin() + i, word.end(), first);
                if (it == word.end()) {
                    new_word.insert(new_word.end(), word.begin() + i, word.end());
                    break;
                }
                new_word.insert(new_word.end(), word.begin() + i, it);
                i = static_cast<int32_t>(std::distance(word.begin(), it));

                if (word[i] == first && i < static_cast<int32_t>(word.size()) - 1 && word[i + 1] == second) {
                    new_word.push_back(first + second);
                    i += 2;
                } else {
                    new_word.push_back(word[i]);
                    i += 1;
                }
            }

            word = new_word;

            if (word.size() == 1) {
                break;
            }
            pairs = get_pairs(word);
        }

        std::u32string result;
        for (int i = 0; i < word.size(); i++) {
            result += word[i];
            if (i != word.size() - 1) {
                result += utf8_to_utf32(" ");
            }
        }

        return result;
    }
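
    // Illustrative walk-through with a hypothetical merge table: for the input
    // U"low", the word starts as {"l", "o", "w</w>"}; if the ranks contain
    // ("l", "o") and then ("lo", "w</w>"), the loop merges to {"lo", "w</w>"}
    // and finally {"low</w>"}, which is returned as a single subword.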

    std::vector<int> tokenize(std::string text, size_t max_length = 0, bool padding = false) {
        std::vector<int32_t> tokens = encode(text);
        tokens.insert(tokens.begin(), BOS_TOKEN_ID);
        if (max_length > 0) {
            if (tokens.size() > max_length - 1) {
                tokens.resize(max_length - 1);
                tokens.push_back(EOS_TOKEN_ID);
            } else {
                tokens.push_back(EOS_TOKEN_ID);
                if (padding) {
                    int pad_token_id = PAD_TOKEN_ID;
                    if (version == VERSION_2_x) {
                        pad_token_id = 0;
                    }
                    tokens.insert(tokens.end(), max_length - tokens.size(), pad_token_id);
                }
            }
        }
        return tokens;
    }

    std::vector<int> encode(std::string text) {
        std::string original_text = text;
        std::vector<int32_t> bpe_tokens;
        text = whitespace_clean(text);
        std::transform(text.begin(), text.end(), text.begin(), [](unsigned char c) { return std::tolower(c); });

        std::regex pat(R"(<\|startoftext\|>|<\|endoftext\|>|'s|'t|'re|'ve|'m|'ll|'d|[[:alpha:]]+|[[:digit:]]|[^[:space:][:alpha:][:digit:]]+)",
                       std::regex::icase);

        std::smatch matches;
        std::string str = text;
        std::vector<std::string> token_strs;
        while (std::regex_search(str, matches, pat)) {
            for (auto& token : matches) {
                std::string token_str = token.str();
                std::u32string utf32_token;
                for (int i = 0; i < token_str.length(); i++) {
                    unsigned char b = token_str[i];  // index as unsigned so bytes >= 0x80 hit the right byte_encoder entry
                    utf32_token += byte_encoder[b];
                }
                auto bpe_strs = bpe(utf32_token);
                size_t start  = 0;
                size_t pos;
                while ((pos = bpe_strs.find(' ', start)) != std::u32string::npos) {
                    auto bpe_str = bpe_strs.substr(start, pos - start);
                    bpe_tokens.push_back(encoder[bpe_str]);
                    token_strs.push_back(utf32_to_utf8(bpe_str));

                    start = pos + 1;
                }
                auto bpe_str = bpe_strs.substr(start, bpe_strs.size() - start);
                bpe_tokens.push_back(encoder[bpe_str]);
                token_strs.push_back(utf32_to_utf8(bpe_str));
            }
            str = matches.suffix();
        }
        std::stringstream ss;
        ss << "[";
        for (auto token : token_strs) {
            ss << "\"" << token << "\", ";
        }
        ss << "]";
        LOG_DEBUG("split prompt \"%s\" to tokens %s", original_text.c_str(), ss.str().c_str());
        return bpe_tokens;
    }
};
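
// Illustrative usage (assumes merges_utf8_str already holds the merges text
// extracted from the model file, as the model loader does elsewhere):
//   CLIPTokenizer tokenizer(VERSION_1_x);
//   tokenizer.load_from_merges(merges_utf8_str);
//   std::vector<int> ids = tokenizer.tokenize("a photo of a cat", 77, true);
//   // ids[0] == BOS_TOKEN_ID (49406); EOS/PAD ids fill the tail up to 77.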

// Ref: https://github.com/AUTOMATIC1111/stable-diffusion-webui/blob/cad87bf4e3e0b0a759afa94e933527c3123d59bc/modules/prompt_parser.py#L345
//
// Parses a string with attention tokens and returns a list of pairs: text and its associated weight.
// Accepted tokens are:
//   (abc) - increases attention to abc by a multiplier of 1.1
//   (abc:3.12) - increases attention to abc by a multiplier of 3.12
//   [abc] - decreases attention to abc by a multiplier of 1.1
//   \( - literal character '('
//   \[ - literal character '['
//   \) - literal character ')'
//   \] - literal character ']'
//   \\ - literal character '\'
//   anything else - just text
//
// >>> parse_prompt_attention('normal text')
// [['normal text', 1.0]]
// >>> parse_prompt_attention('an (important) word')
// [['an ', 1.0], ['important', 1.1], [' word', 1.0]]
// >>> parse_prompt_attention('(unbalanced')
// [['unbalanced', 1.1]]
// >>> parse_prompt_attention('\(literal\]')
// [['(literal]', 1.0]]
// >>> parse_prompt_attention('(unnecessary)(parens)')
// [['unnecessaryparens', 1.1]]
// >>> parse_prompt_attention('a (((house:1.3)) [on] a (hill:0.5), sun, (((sky))).')
// [['a ', 1.0],
//  ['house', 1.5730000000000004],
//  [' ', 1.1],
//  ['on', 1.0],
//  [' a ', 1.1],
//  ['hill', 0.55],
//  [', sun, ', 1.1],
//  ['sky', 1.4641000000000006],
//  ['.', 1.1]]
std::vector<std::pair<std::string, float>> parse_prompt_attention(const std::string& text) {
    std::vector<std::pair<std::string, float>> res;
    std::vector<int> round_brackets;
    std::vector<int> square_brackets;

    float round_bracket_multiplier  = 1.1f;
    float square_bracket_multiplier = 1 / 1.1f;

    std::regex re_attention(R"(\\\(|\\\)|\\\[|\\\]|\\\\|\\|\(|\[|:([+-]?[.\d]+)\)|\)|\]|[^\\()\[\]:]+|:)");
    std::regex re_break(R"(\s*\bBREAK\b\s*)");

    auto multiply_range = [&](int start_position, float multiplier) {
        for (int p = start_position; p < res.size(); ++p) {
            res[p].second *= multiplier;
        }
    };

    std::smatch m;
    std::string remaining_text = text;

    while (std::regex_search(remaining_text, m, re_attention)) {
        std::string text   = m[0];
        std::string weight = m[1];

        if (text == "(") {
            round_brackets.push_back((int)res.size());
        } else if (text == "[") {
            square_brackets.push_back((int)res.size());
        } else if (!weight.empty()) {
            if (!round_brackets.empty()) {
                multiply_range(round_brackets.back(), std::stof(weight));
                round_brackets.pop_back();
            }
        } else if (text == ")" && !round_brackets.empty()) {
            multiply_range(round_brackets.back(), round_bracket_multiplier);
            round_brackets.pop_back();
        } else if (text == "]" && !square_brackets.empty()) {
            multiply_range(square_brackets.back(), square_bracket_multiplier);
            square_brackets.pop_back();
        } else if (text == "\\(") {
            res.push_back({text.substr(1), 1.0f});
        } else {
            res.push_back({text, 1.0f});
        }

        remaining_text = m.suffix();
    }

    for (int pos : round_brackets) {
        multiply_range(pos, round_bracket_multiplier);
    }

    for (int pos : square_brackets) {
        multiply_range(pos, square_bracket_multiplier);
    }

    if (res.empty()) {
        res.push_back({"", 1.0f});
    }

    int i = 0;
    while (i + 1 < res.size()) {
        if (res[i].second == res[i + 1].second) {
            res[i].first += res[i + 1].first;
            res.erase(res.begin() + i + 1);
        } else {
            ++i;
        }
    }

    return res;
}
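
// Illustrative usage, matching the Python examples above:
//   parse_prompt_attention("an (important) word")
//   // -> {{"an ", 1.0f}, {"important", 1.1f}, {" word", 1.0f}}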

/*================================================ FrozenCLIPEmbedder ================================================*/

struct ResidualAttentionBlock {
    int32_t n_head;
    int32_t d_model;
    int32_t hidden_size;  // n_head * d_model
    int32_t intermediate_size;

    // attention
    struct ggml_tensor* q_w;  // [hidden_size, hidden_size]
    struct ggml_tensor* q_b;  // [hidden_size, ]
    struct ggml_tensor* k_w;  // [hidden_size, hidden_size]
    struct ggml_tensor* k_b;  // [hidden_size, ]
    struct ggml_tensor* v_w;  // [hidden_size, hidden_size]
    struct ggml_tensor* v_b;  // [hidden_size, ]

    struct ggml_tensor* out_w;  // [hidden_size, hidden_size]
    struct ggml_tensor* out_b;  // [hidden_size, ]

    // layer norm 1
    struct ggml_tensor* ln1_w;  // [hidden_size, ]
    struct ggml_tensor* ln1_b;  // [hidden_size, ]

    // mlp
    struct ggml_tensor* fc1_w;  // [intermediate_size, hidden_size]
    struct ggml_tensor* fc1_b;  // [intermediate_size, ]

    struct ggml_tensor* fc2_w;  // [hidden_size, intermediate_size]
    struct ggml_tensor* fc2_b;  // [hidden_size, ]

    // layer norm 2
    struct ggml_tensor* ln2_w;  // [hidden_size, ]
    struct ggml_tensor* ln2_b;  // [hidden_size, ]

    struct ggml_tensor* attn_scale;  // [1, ]

    size_t calculate_mem_size(ggml_type wtype) {
        double mem_size = 0;
        mem_size += 4 * hidden_size * hidden_size * ggml_type_sizef(wtype);        // q_w/k_w/v_w/out_w
        mem_size += 8 * hidden_size * ggml_type_sizef(GGML_TYPE_F32);              // q_b/k_b/v_b/out_b/ln1_w/ln1_b/ln2_w/ln2_b
        mem_size += 2 * hidden_size * intermediate_size * ggml_type_sizef(wtype);  // fc1_w/fc2_w
        mem_size += intermediate_size * ggml_type_sizef(GGML_TYPE_F32);            // fc1_b
        mem_size += hidden_size * ggml_type_sizef(GGML_TYPE_F32);                  // fc2_b
        mem_size += ggml_type_sizef(GGML_TYPE_F32);                                // attn_scale
        return static_cast<size_t>(mem_size);
    }

    void init_params(struct ggml_context* ctx, ggml_allocr* alloc, ggml_type wtype) {
        ln1_w = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, hidden_size);
        ln1_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, hidden_size);

        q_w = ggml_new_tensor_2d(ctx, wtype, hidden_size, hidden_size);
        q_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, hidden_size);
        k_w = ggml_new_tensor_2d(ctx, wtype, hidden_size, hidden_size);
        k_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, hidden_size);
        v_w = ggml_new_tensor_2d(ctx, wtype, hidden_size, hidden_size);
        v_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, hidden_size);

        out_w = ggml_new_tensor_2d(ctx, wtype, hidden_size, hidden_size);
        out_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, hidden_size);

        fc1_w = ggml_new_tensor_2d(ctx, wtype, hidden_size, intermediate_size);
        fc1_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, intermediate_size);

        fc2_w = ggml_new_tensor_2d(ctx, wtype, intermediate_size, hidden_size);
        fc2_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, hidden_size);

        ln2_w = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, hidden_size);
        ln2_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, hidden_size);

        attn_scale = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 1);
        ggml_allocr_alloc(alloc, attn_scale);
        float scale = 1.0f / sqrt((float)d_model);
        ggml_backend_tensor_set(attn_scale, &scale, 0, sizeof(scale));
    }

    void map_by_name(std::map<std::string, struct ggml_tensor*>& tensors, const std::string prefix) {
        tensors[prefix + "self_attn.q_proj.weight"]   = q_w;
        tensors[prefix + "self_attn.q_proj.bias"]     = q_b;
        tensors[prefix + "self_attn.k_proj.weight"]   = k_w;
        tensors[prefix + "self_attn.k_proj.bias"]     = k_b;
        tensors[prefix + "self_attn.v_proj.weight"]   = v_w;
        tensors[prefix + "self_attn.v_proj.bias"]     = v_b;
        tensors[prefix + "self_attn.out_proj.weight"] = out_w;
        tensors[prefix + "self_attn.out_proj.bias"]   = out_b;

        tensors[prefix + "layer_norm1.weight"] = ln1_w;
        tensors[prefix + "layer_norm1.bias"]   = ln1_b;

        tensors[prefix + "layer_norm2.weight"] = ln2_w;
        tensors[prefix + "layer_norm2.bias"]   = ln2_b;

        tensors[prefix + "mlp.fc1.weight"] = fc1_w;
        tensors[prefix + "mlp.fc1.bias"]   = fc1_b;

        tensors[prefix + "mlp.fc2.weight"] = fc2_w;
        tensors[prefix + "mlp.fc2.bias"]   = fc2_b;
    }

    struct ggml_tensor* forward(struct ggml_context* ctx, struct ggml_tensor* x) {
        // x: [N, n_token, hidden_size]
        int64_t N           = x->ne[2];
        int64_t n_token     = x->ne[1];
        int64_t hidden_size = n_head * d_model;

        struct ggml_tensor* r = x;

        // layer norm 1
        x = ggml_nn_layer_norm(ctx, x, ln1_w, ln1_b);
        // self-attention
        {
            struct ggml_tensor* q = ggml_nn_linear(ctx, x, q_w, q_b);
            q = ggml_scale_inplace(ctx, q, attn_scale);
            q = ggml_reshape_4d(ctx, q, d_model, n_head, n_token, N);   // [N, n_token, n_head, d_model]
            q = ggml_cont(ctx, ggml_permute(ctx, q, 0, 2, 1, 3));       // [N, n_head, n_token, d_model]
            q = ggml_reshape_3d(ctx, q, d_model, n_token, n_head * N);  // [N * n_head, n_token, d_model]

            struct ggml_tensor* k = ggml_nn_linear(ctx, x, k_w, k_b);
            k = ggml_reshape_4d(ctx, k, d_model, n_head, n_token, N);   // [N, n_token, n_head, d_model]
            k = ggml_cont(ctx, ggml_permute(ctx, k, 0, 2, 1, 3));       // [N, n_head, n_token, d_model]
            k = ggml_reshape_3d(ctx, k, d_model, n_token, n_head * N);  // [N * n_head, n_token, d_model]

            struct ggml_tensor* v = ggml_nn_linear(ctx, x, v_w, v_b);
            v = ggml_reshape_4d(ctx, v, d_model, n_head, n_token, N);   // [N, n_token, n_head, d_model]
            v = ggml_cont(ctx, ggml_permute(ctx, v, 1, 2, 0, 3));       // [N, n_head, d_model, n_token]
            v = ggml_reshape_3d(ctx, v, n_token, d_model, n_head * N);  // [N * n_head, d_model, n_token]

            struct ggml_tensor* kq = ggml_mul_mat(ctx, k, q);  // [N * n_head, n_token, n_token]

            kq = ggml_diag_mask_inf_inplace(ctx, kq, 0);
            kq = ggml_soft_max_inplace(ctx, kq);

            struct ggml_tensor* kqv = ggml_mul_mat(ctx, v, kq);  // [N * n_head, n_token, d_model]
            kqv = ggml_reshape_4d(ctx, kqv, d_model, n_token, n_head, N);
            kqv = ggml_cont(ctx, ggml_permute(ctx, kqv, 0, 2, 1, 3));  // [N, n_token, n_head, d_model]

            x = ggml_reshape_2d(ctx, kqv, d_model * n_head, n_token * N);  // [N * n_token, d_model * n_head]
        }

        // attention output
        x = ggml_nn_linear(ctx, x, out_w, out_b);

        // residual
        x = ggml_add(ctx, x, r);
        r = x;

        // layer norm 2
        x = ggml_nn_layer_norm(ctx, x, ln2_w, ln2_b);

        // mlp
        x = ggml_nn_linear(ctx, x, fc1_w, fc1_b);

        if (hidden_size == 1024 || hidden_size == 1280) {  // SD 2.x
            x = ggml_gelu_inplace(ctx, x);
        } else {  // SD 1.x
            x = ggml_gelu_quick_inplace(ctx, x);
        }

        x = ggml_nn_linear(ctx, x, fc2_w, fc2_b);

        // residual 2
        x = ggml_add(ctx, x, r);
        return x;
    }
};

// OPENAI_CLIP_VIT_L_14: https://huggingface.co/openai/clip-vit-large-patch14/blob/main/config.json
// OPEN_CLIP_VIT_H_14: https://huggingface.co/laion/CLIP-ViT-H-14-laion2B-s32B-b79K/blob/main/config.json
// OPEN_CLIP_VIT_BIGG_14: https://huggingface.co/laion/CLIP-ViT-bigG-14-laion2B-39B-b160k/blob/main/config.json (CLIPTextModelWithProjection)
// SDXL CLIPModel
// CLIPTextModelWithProjection seems optional

enum CLIPVersion {
    OPENAI_CLIP_VIT_L_14,   // SD 1.x and SDXL
    OPEN_CLIP_VIT_H_14,     // SD 2.x
    OPEN_CLIP_VIT_BIGG_14,  // SDXL
};

struct CLIPTextModel {
    CLIPVersion version = OPENAI_CLIP_VIT_L_14;
    // network hparams
    int32_t vocab_size              = 49408;
    int32_t max_position_embeddings = 77;
    int32_t hidden_size             = 768;   // 1024 for OPEN_CLIP_VIT_H_14
    int32_t intermediate_size       = 3072;  // 4096 for OPEN_CLIP_VIT_H_14
    int32_t n_head                  = 12;    // num_attention_heads, 16 for OPEN_CLIP_VIT_H_14
    int32_t num_hidden_layers       = 12;    // 24 for OPEN_CLIP_VIT_H_14
    int32_t layer_idx               = 11;
    int32_t projection_dim          = 1280;  // only for OPEN_CLIP_VIT_BIGG_14
    bool with_final_ln              = true;

    // embeddings
    struct ggml_tensor* position_ids;
    struct ggml_tensor* token_embed_weight;
    struct ggml_tensor* position_embed_weight;

    // transformer
    std::vector<ResidualAttentionBlock> resblocks;
    struct ggml_tensor* final_ln_w;
    struct ggml_tensor* final_ln_b;

    struct ggml_tensor* text_projection;

    CLIPTextModel(CLIPVersion version = OPENAI_CLIP_VIT_L_14,
                  int clip_skip       = -1,
                  bool with_final_ln  = true)
        : version(version), with_final_ln(with_final_ln) {
        if (version == OPEN_CLIP_VIT_H_14) {
            hidden_size       = 1024;
            intermediate_size = 4096;
            n_head            = 16;
            num_hidden_layers = 24;
        } else if (version == OPEN_CLIP_VIT_BIGG_14) {  // CLIPTextModelWithProjection
            hidden_size       = 1280;
            intermediate_size = 5120;
            n_head            = 20;
            num_hidden_layers = 32;
        }
        set_clip_skip(clip_skip);
        resblocks.resize(num_hidden_layers);
        set_resblocks_hp_params();
    }

    void set_clip_skip(int clip_skip) {
        if (clip_skip > 0) {
            layer_idx = num_hidden_layers - clip_skip;
        }
    }

    void set_resblocks_hp_params() {
        int d_model = hidden_size / n_head;  // 64 for all supported variants (768/12, 1024/16, 1280/20)
        for (int i = 0; i < num_hidden_layers; i++) {
            resblocks[i].d_model           = d_model;
            resblocks[i].n_head            = n_head;
            resblocks[i].hidden_size       = hidden_size;
            resblocks[i].intermediate_size = intermediate_size;
        }
    }

    size_t calculate_mem_size(ggml_type wtype) {
        double mem_size = 0;
        mem_size += hidden_size * max_position_embeddings * ggml_type_sizef(GGML_TYPE_I32);  // position_ids
        mem_size += hidden_size * vocab_size * ggml_type_sizef(wtype);                       // token_embed_weight
        mem_size += hidden_size * max_position_embeddings * ggml_type_sizef(wtype);          // position_embed_weight
        for (int i = 0; i < num_hidden_layers; i++) {
            mem_size += resblocks[i].calculate_mem_size(wtype);
        }
        mem_size += 2 * hidden_size * ggml_type_sizef(GGML_TYPE_F32);  // final_ln_w/b
        if (version == OPEN_CLIP_VIT_BIGG_14) {
            mem_size += hidden_size * projection_dim * ggml_type_sizef(GGML_TYPE_F32);  // text_projection
        }
        return static_cast<size_t>(mem_size);
    }

    void map_by_name(std::map<std::string, struct ggml_tensor*>& tensors, const std::string prefix) {
        tensors[prefix + "embeddings.token_embedding.weight"]    = token_embed_weight;
        tensors[prefix + "embeddings.position_embedding.weight"] = position_embed_weight;
        tensors[prefix + "final_layer_norm.weight"]              = final_ln_w;
        tensors[prefix + "final_layer_norm.bias"]                = final_ln_b;
        for (int i = 0; i < num_hidden_layers; i++) {
            std::string name = prefix + "encoder.layers." + std::to_string(i) + ".";
            resblocks[i].map_by_name(tensors, name);
        }
        if (version == OPEN_CLIP_VIT_BIGG_14) {
            tensors[prefix + "text_projection"] = text_projection;
        }
    }

    struct ggml_tensor* forward(struct ggml_context* ctx0, struct ggml_tensor* input_ids, size_t max_token_idx = 0, bool return_pooled = false) {
        // input_ids: [N, n_token]
        GGML_ASSERT(input_ids->ne[0] <= position_ids->ne[0]);

        // token_embedding + position_embedding
        struct ggml_tensor* x;
        x = ggml_add(ctx0,
                     ggml_get_rows(ctx0, token_embed_weight, input_ids),
                     ggml_get_rows(ctx0,
                                   position_embed_weight,
                                   ggml_view_1d(ctx0, position_ids, input_ids->ne[0], 0)));  // [N, n_token, hidden_size]

        // transformer
        for (int i = 0; i < num_hidden_layers; i++) {
            if (!return_pooled && i == layer_idx + 1) {
                // LOG_DEBUG("layer %d", i);
                break;
            }
            x = resblocks[i].forward(ctx0, x);  // [N, n_token, hidden_size]
        }

        // final layer norm
        if (return_pooled || with_final_ln) {
            x = ggml_nn_layer_norm(ctx0, x, final_ln_w, final_ln_b);
        }

        if (return_pooled) {
            // ggml_tensor* idx = ggml_argmax(ctx0, input_ids);
            // ggml_tensor* pooled = ggml_get_rows(ctx0, x, idx);
            // LOG_DEBUG("max_token_idx: %u %u", max_token_idx, x->nb[1]);
            ggml_tensor* pooled = ggml_view_1d(ctx0, x, hidden_size, x->nb[1] * max_token_idx);
            pooled = ggml_mul_mat(ctx0, ggml_cont(ctx0, ggml_transpose(ctx0, text_projection)), pooled);
            return pooled;
        }

        return x;  // [N, n_token, hidden_size]
    }

    void init_params(ggml_context* ctx, ggml_backend_t backend, ggml_type wtype, ggml_allocr* alloc) {
        position_ids = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, max_position_embeddings);

        token_embed_weight = ggml_new_tensor_2d(ctx, wtype, hidden_size, vocab_size);

        position_embed_weight = ggml_new_tensor_2d(ctx, wtype, hidden_size, max_position_embeddings);

        for (int i = 0; i < num_hidden_layers; i++) {
            resblocks[i].init_params(ctx, alloc, wtype);
        }

        final_ln_w = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, hidden_size);

        final_ln_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, hidden_size);

        if (version == OPEN_CLIP_VIT_BIGG_14) {
            text_projection = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, projection_dim, hidden_size);
        }

        // alloc all tensors linked to this context
        for (struct ggml_tensor* t = ggml_get_first_tensor(ctx); t != NULL; t = ggml_get_next_tensor(ctx, t)) {
            if (t->data == NULL) {
                ggml_allocr_alloc(alloc, t);
            }
        }

        if (ggml_backend_is_cpu(backend)) {
            for (int i = 0; i < max_position_embeddings; i++) {
                ggml_set_i32_1d(position_ids, i, i);
            }
        } else {
            std::vector<int> pos_temp;
            for (int i = 0; i < max_position_embeddings; i++) {
                pos_temp.push_back(i);
            }
            ggml_backend_tensor_set(position_ids, pos_temp.data(), 0, ggml_nbytes(position_ids));
        }
    }
};

// ldm.modules.encoders.modules.FrozenCLIPEmbedder
// Ref: https://github.com/AUTOMATIC1111/stable-diffusion-webui/blob/cad87bf4e3e0b0a759afa94e933527c3123d59bc/modules/sd_hijack_clip.py#L283
struct FrozenCLIPEmbedderWithCustomWords : public GGMLModule {
    SDVersion version = VERSION_1_x;
    CLIPTokenizer tokenizer;
    CLIPTextModel text_model;
    CLIPTextModel text_model2;

    FrozenCLIPEmbedderWithCustomWords(SDVersion version = VERSION_1_x, int clip_skip = -1)
        : version(version), tokenizer(version) {
        name = "clip";
        if (clip_skip <= 0) {
            clip_skip = 1;
            if (version == VERSION_2_x || version == VERSION_XL) {
                clip_skip = 2;
            }
        }
        if (version == VERSION_1_x) {
            text_model = CLIPTextModel(OPENAI_CLIP_VIT_L_14, clip_skip);
        } else if (version == VERSION_2_x) {
            text_model = CLIPTextModel(OPEN_CLIP_VIT_H_14, clip_skip);
        } else if (version == VERSION_XL) {
            text_model  = CLIPTextModel(OPENAI_CLIP_VIT_L_14, clip_skip, false);
            text_model2 = CLIPTextModel(OPEN_CLIP_VIT_BIGG_14, clip_skip, false);
        }
    }

    void set_clip_skip(int clip_skip) {
        text_model.set_clip_skip(clip_skip);
        if (version == VERSION_XL) {
            text_model2.set_clip_skip(clip_skip);
        }
    }

    size_t calculate_mem_size() {
        size_t mem_size = text_model.calculate_mem_size(wtype);
        if (version == VERSION_XL) {
            mem_size += text_model2.calculate_mem_size(wtype);
        }
        return mem_size;
    }

    size_t get_num_tensors() {
        size_t num_tensors = (3 + 2 + 37 * text_model.num_hidden_layers);
        if (version == VERSION_XL) {
            num_tensors += (3 + 2 + 37 * text_model2.num_hidden_layers);
        }
        return num_tensors;
    }

    void map_by_name(std::map<std::string, struct ggml_tensor*>& tensors, const std::string prefix) {
        text_model.map_by_name(tensors, prefix + "transformer.text_model.");
        if (version == VERSION_XL) {
            text_model2.map_by_name(tensors, prefix + "1.transformer.text_model.");
        }
    }

    struct ggml_tensor* forward(struct ggml_context* ctx0, struct ggml_tensor* input_ids, struct ggml_tensor* input_ids2, size_t max_token_idx = 0, bool return_pooled = false) {
        if (return_pooled) {
            return text_model2.forward(ctx0, input_ids2, max_token_idx, return_pooled);
        }
        auto hidden_states = text_model.forward(ctx0, input_ids);  // [N, n_token, hidden_size]
        // LOG_DEBUG("hidden_states: %d %d %d %d %d", hidden_states->n_dims, hidden_states->ne[0], hidden_states->ne[1], hidden_states->ne[2], hidden_states->ne[3]);
        if (version == VERSION_XL) {
            hidden_states = ggml_reshape_4d(ctx0,
                                            hidden_states,
                                            hidden_states->ne[0],
                                            hidden_states->ne[1],
                                            hidden_states->ne[2],
                                            hidden_states->ne[3]);
            hidden_states = ggml_cont(ctx0, ggml_permute(ctx0, hidden_states, 2, 0, 1, 3));

            auto hidden_states2 = text_model2.forward(ctx0, input_ids2);  // [N, n_token, hidden_size2]
            hidden_states2 = ggml_reshape_4d(ctx0,
                                             hidden_states2,
                                             hidden_states2->ne[0],
                                             hidden_states2->ne[1],
                                             hidden_states2->ne[2],
                                             hidden_states2->ne[3]);
            hidden_states2 = ggml_cont(ctx0, ggml_permute(ctx0, hidden_states2, 2, 0, 1, 3));

            hidden_states = ggml_concat(ctx0, hidden_states, hidden_states2);  // [N, n_token, hidden_size + hidden_size2]

            hidden_states = ggml_cont(ctx0, ggml_permute(ctx0, hidden_states, 1, 2, 0, 3));
        }
        // LOG_DEBUG("hidden_states: %d %d %d %d", hidden_states->ne[0], hidden_states->ne[1], hidden_states->ne[2], hidden_states->ne[3]);
        return hidden_states;
    }

    std::pair<std::vector<int>, std::vector<float>> tokenize(std::string text,
                                                             bool padding = false) {
        return tokenize(text, text_model.max_position_embeddings, padding);
    }

    std::pair<std::vector<int>, std::vector<float>> tokenize(std::string text,
                                                             size_t max_length = 0,
                                                             bool padding      = false) {
        auto parsed_attention = parse_prompt_attention(text);

        {
            std::stringstream ss;
            ss << "[";
            for (const auto& item : parsed_attention) {
                ss << "['" << item.first << "', " << item.second << "], ";
            }
            ss << "]";
            LOG_DEBUG("parse '%s' to %s", text.c_str(), ss.str().c_str());
        }

        std::vector<int> tokens;
        std::vector<float> weights;
        for (const auto& item : parsed_attention) {
            const std::string& curr_text = item.first;
            float curr_weight            = item.second;
            std::vector<int> curr_tokens = tokenizer.encode(curr_text);
            tokens.insert(tokens.end(), curr_tokens.begin(), curr_tokens.end());
            weights.insert(weights.end(), curr_tokens.size(), curr_weight);
        }
        tokens.insert(tokens.begin(), BOS_TOKEN_ID);
        weights.insert(weights.begin(), 1.0);

        if (max_length > 0) {
            if (tokens.size() > max_length - 1) {
                tokens.resize(max_length - 1);
                weights.resize(max_length - 1);
                tokens.push_back(EOS_TOKEN_ID);
                weights.push_back(1.0);
            } else {
                tokens.push_back(EOS_TOKEN_ID);
                weights.push_back(1.0);
                if (padding) {
                    int pad_token_id = PAD_TOKEN_ID;
                    if (version == VERSION_2_x) {
                        pad_token_id = 0;
                    }
                    tokens.insert(tokens.end(), max_length - tokens.size(), pad_token_id);
                    weights.insert(weights.end(), max_length - weights.size(), 1.0);
                }
            }
        }

        // for (int i = 0; i < tokens.size(); i++) {
        //     std::cout << tokens[i] << ":" << weights[i] << ", ";
        // }
        // std::cout << std::endl;

        return {tokens, weights};
    }

    void init_params() {
        ggml_allocr* alloc = ggml_allocr_new_from_buffer(params_buffer);
        text_model.init_params(params_ctx, backend, wtype, alloc);
        if (version == VERSION_XL) {
            text_model2.init_params(params_ctx, backend, wtype, alloc);
        }
        ggml_allocr_free(alloc);
    }

    struct ggml_cgraph* build_graph(struct ggml_allocr* allocr, std::vector<int> tokens, bool return_pooled = false) {
        // since we are using ggml-alloc, this buffer only needs enough space to hold the ggml_tensor and ggml_cgraph structs, but not the tensor data
        static size_t buf_size = ggml_tensor_overhead() * GGML_DEFAULT_GRAPH_SIZE + ggml_graph_overhead();
        static std::vector<uint8_t> buf(buf_size);

        struct ggml_init_params params = {
            /*.mem_size   =*/buf_size,
            /*.mem_buffer =*/buf.data(),
            /*.no_alloc   =*/true,  // the tensors will be allocated later by ggml_allocr_alloc_graph()
        };

        struct ggml_context* ctx0 = ggml_init(params);

        struct ggml_cgraph* gf = ggml_new_graph(ctx0);

        struct ggml_tensor* input_ids = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, tokens.size());
        ggml_allocr_alloc(allocr, input_ids);

        if (!ggml_allocr_is_measure(allocr)) {
            ggml_backend_tensor_set(input_ids, tokens.data(), 0, tokens.size() * ggml_element_size(input_ids));
        }

        struct ggml_tensor* input_ids2 = NULL;
        size_t max_token_idx           = 0;
        if (version == VERSION_XL) {
            input_ids2 = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, tokens.size());
            ggml_allocr_alloc(allocr, input_ids2);

            auto it = std::find(tokens.begin(), tokens.end(), EOS_TOKEN_ID);
            if (it != tokens.end()) {
                std::fill(std::next(it), tokens.end(), 0);
            }

            max_token_idx = std::min<size_t>(std::distance(tokens.begin(), it), tokens.size() - 1);

            // for (int i = 0; i < tokens.size(); i++) {
            //     printf("%d ", tokens[i]);
            // }
            // printf("\n");

            if (!ggml_allocr_is_measure(allocr)) {
                ggml_backend_tensor_set(input_ids2, tokens.data(), 0, tokens.size() * ggml_element_size(input_ids2));
            }
        }

        struct ggml_tensor* hidden_states = forward(ctx0, input_ids, input_ids2, max_token_idx, return_pooled);

        ggml_build_forward_expand(gf, hidden_states);
        ggml_free(ctx0);

        return gf;
    }

    void alloc_compute_buffer(ggml_context* work_ctx, int max_tokens) {
        auto get_graph = [&]() -> struct ggml_cgraph* {
            bool return_pooled = false;
            if (version == VERSION_XL) {
                return_pooled = true;
            }
            return build_graph(compute_allocr, std::vector<int>(max_tokens), return_pooled);
        };
        GGMLModule::alloc_compute_buffer(get_graph);
    }

    void compute(const int n_threads,
                 std::vector<int> tokens,
                 ggml_tensor* hidden_state_output,
                 ggml_tensor* pooled_output = NULL) {
        auto get_graph = [&]() -> struct ggml_cgraph* {
            return build_graph(compute_allocr, tokens, false);
        };
        GGMLModule::compute(get_graph, n_threads, hidden_state_output);

        if (version == VERSION_XL && pooled_output != NULL) {
            auto get_graph = [&]() -> struct ggml_cgraph* {
                return build_graph(compute_allocr, tokens, true);
            };
            GGMLModule::compute(get_graph, n_threads, pooled_output);
        }
    }
};

#endif  // __CLIP_HPP__
86 common.hpp Normal file
@@ -0,0 +1,86 @@
#ifndef __COMMON_HPP__
#define __COMMON_HPP__

#include "ggml_extend.hpp"

struct DownSample {
    // hparams
    int channels;
    int out_channels;

    // conv2d params
    struct ggml_tensor* op_w;  // [out_channels, channels, 3, 3]
    struct ggml_tensor* op_b;  // [out_channels,]

    bool vae_downsample = false;

    size_t calculate_mem_size(ggml_type wtype) {
        double mem_size = 0;
        mem_size += out_channels * channels * 3 * 3 * ggml_type_sizef(GGML_TYPE_F16);  // op_w
        mem_size += out_channels * ggml_type_sizef(GGML_TYPE_F32);                     // op_b
        return static_cast<size_t>(mem_size);
    }

    void init_params(struct ggml_context* ctx, ggml_type wtype) {
        op_w = ggml_new_tensor_4d(ctx, GGML_TYPE_F16, 3, 3, channels, out_channels);
        op_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, out_channels);
    }

    void map_by_name(std::map<std::string, struct ggml_tensor*>& tensors, const std::string prefix) {
        if (vae_downsample) {
            tensors[prefix + "conv.weight"] = op_w;
            tensors[prefix + "conv.bias"]   = op_b;
        } else {
            tensors[prefix + "op.weight"] = op_w;
            tensors[prefix + "op.bias"]   = op_b;
        }
    }

    struct ggml_tensor* forward(struct ggml_context* ctx, struct ggml_tensor* x) {
        // x: [N, channels, h, w]
        struct ggml_tensor* c = NULL;
        if (vae_downsample) {
            c = ggml_pad(ctx, x, 1, 1, 0, 0);
            c = ggml_nn_conv_2d(ctx, c, op_w, op_b, 2, 2, 0, 0);
        } else {
            c = ggml_nn_conv_2d(ctx, x, op_w, op_b, 2, 2, 1, 1);
        }
        return c;  // [N, out_channels, h/2, w/2]
    }
};

struct UpSample {
    // hparams
    int channels;
    int out_channels;

    // conv2d params
    struct ggml_tensor* conv_w;  // [out_channels, channels, 3, 3]
    struct ggml_tensor* conv_b;  // [out_channels,]

    size_t calculate_mem_size(ggml_type wtype) {
        double mem_size = 0;
        mem_size += out_channels * channels * 3 * 3 * ggml_type_sizef(GGML_TYPE_F16);  // conv_w
        mem_size += out_channels * ggml_type_sizef(GGML_TYPE_F32);                     // conv_b
        return static_cast<size_t>(mem_size);
    }

    void init_params(struct ggml_context* ctx, ggml_type wtype) {
        conv_w = ggml_new_tensor_4d(ctx, GGML_TYPE_F16, 3, 3, channels, out_channels);
        conv_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, out_channels);
    }

    void map_by_name(std::map<std::string, struct ggml_tensor*>& tensors, const std::string prefix) {
        tensors[prefix + "conv.weight"] = conv_w;
        tensors[prefix + "conv.bias"]   = conv_b;
    }

    struct ggml_tensor* forward(struct ggml_context* ctx, struct ggml_tensor* x) {
        // x: [N, channels, h, w]
        x = ggml_upscale(ctx, x, 2);                              // [N, channels, h*2, w*2]
        x = ggml_nn_conv_2d(ctx, x, conv_w, conv_b, 1, 1, 1, 1);  // [N, out_channels, h*2, w*2]
        return x;
    }
};

#endif  // __COMMON_HPP__
125 denoiser.hpp Normal file
@@ -0,0 +1,125 @@
#ifndef __DENOISER_HPP__
#define __DENOISER_HPP__

#include "ggml_extend.hpp"

/*================================================= CompVisDenoiser ==================================================*/

// Ref: https://github.com/crowsonkb/k-diffusion/blob/master/k_diffusion/external.py

#define TIMESTEPS 1000

struct SigmaSchedule {
    float alphas_cumprod[TIMESTEPS];
    float sigmas[TIMESTEPS];
    float log_sigmas[TIMESTEPS];

    virtual std::vector<float> get_sigmas(uint32_t n) = 0;

    float sigma_to_t(float sigma) {
        float log_sigma = std::log(sigma);
        std::vector<float> dists;
        dists.reserve(TIMESTEPS);
        for (float log_sigma_val : log_sigmas) {
            dists.push_back(log_sigma - log_sigma_val);
        }

        int low_idx = 0;
        for (size_t i = 0; i < TIMESTEPS; i++) {
            if (dists[i] >= 0) {
                low_idx++;
            }
        }
        low_idx      = std::min(std::max(low_idx - 1, 0), TIMESTEPS - 2);
        int high_idx = low_idx + 1;

        float low  = log_sigmas[low_idx];
        float high = log_sigmas[high_idx];
        float w    = (low - log_sigma) / (low - high);
        w          = std::max(0.f, std::min(1.f, w));
        float t    = (1.0f - w) * low_idx + w * high_idx;

        return t;
    }

    float t_to_sigma(float t) {
        int low_idx     = static_cast<int>(std::floor(t));
        int high_idx    = static_cast<int>(std::ceil(t));
        float w         = t - static_cast<float>(low_idx);
        float log_sigma = (1.0f - w) * log_sigmas[low_idx] + w * log_sigmas[high_idx];
        return std::exp(log_sigma);
    }
};
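
// Worked example (illustrative): sigma_to_t() and t_to_sigma() are inverse
// linear interpolations in log-sigma space. If log_sigmas[10] = -1.2f and
// log_sigmas[11] = -1.0f, then t_to_sigma(10.5f) == expf(-1.1f), and
// sigma_to_t(expf(-1.1f)) recovers t == 10.5f.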

struct DiscreteSchedule : SigmaSchedule {
    std::vector<float> get_sigmas(uint32_t n) {
        std::vector<float> result;

        int t_max = TIMESTEPS - 1;

        if (n == 0) {
            return result;
        } else if (n == 1) {
            result.push_back(t_to_sigma((float)t_max));
            result.push_back(0);
            return result;
        }

        float step = static_cast<float>(t_max) / static_cast<float>(n - 1);
        for (uint32_t i = 0; i < n; ++i) {
            float t = t_max - step * i;
            result.push_back(t_to_sigma(t));
        }
        result.push_back(0);
        return result;
    }
};

struct KarrasSchedule : SigmaSchedule {
    std::vector<float> get_sigmas(uint32_t n) {
        // These *COULD* be function arguments here,
        // but does anybody ever bother to touch them?
        float sigma_min = 0.1f;
        float sigma_max = 10.f;
        float rho       = 7.f;

        std::vector<float> result(n + 1);

        float min_inv_rho = pow(sigma_min, (1.f / rho));
        float max_inv_rho = pow(sigma_max, (1.f / rho));
        for (uint32_t i = 0; i < n; i++) {
            // Eq. (5) from Karras et al 2022
            result[i] = pow(max_inv_rho + (float)i / ((float)n - 1.f) * (min_inv_rho - max_inv_rho), rho);
        }
        result[n] = 0.;
        return result;
    }
};
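
// Worked example (illustrative): with the defaults above and n = 2, Eq. (5)
// reduces to its endpoints: result[0] == sigma_max == 10.f,
// result[1] == sigma_min == 0.1f, and result[2] == 0 terminates the schedule.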

struct Denoiser {
    std::shared_ptr<SigmaSchedule> schedule = std::make_shared<DiscreteSchedule>();

    virtual std::vector<float> get_scalings(float sigma) = 0;
};

struct CompVisDenoiser : public Denoiser {
    float sigma_data = 1.0f;

    std::vector<float> get_scalings(float sigma) {
        float c_out = -sigma;
        float c_in  = 1.0f / std::sqrt(sigma * sigma + sigma_data * sigma_data);
        return {c_out, c_in};
    }
};
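
// Worked example (illustrative): for sigma = 1.0f (with sigma_data = 1.0f),
// get_scalings() returns c_out = -1.0f and c_in = 1/sqrt(2) ≈ 0.7071f: the
// input is rescaled toward unit variance, and c_out scales the eps prediction
// when reconstructing the denoised sample, as in k-diffusion's CompVisDenoiser.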

struct CompVisVDenoiser : public Denoiser {
    float sigma_data = 1.0f;

    std::vector<float> get_scalings(float sigma) {
        float c_skip = sigma_data * sigma_data / (sigma * sigma + sigma_data * sigma_data);
        float c_out  = -sigma * sigma_data / std::sqrt(sigma * sigma + sigma_data * sigma_data);
        float c_in   = 1.0f / std::sqrt(sigma * sigma + sigma_data * sigma_data);
        return {c_skip, c_out, c_in};
    }
};

#endif  // __DENOISER_HPP__
423 esrgan.hpp Normal file
@@ -0,0 +1,423 @@
#ifndef __ESRGAN_HPP__
#define __ESRGAN_HPP__

#include "ggml_extend.hpp"
#include "model.h"

/*
    =================================== ESRGAN ===================================
    References:
    https://github.com/xinntao/Real-ESRGAN/blob/master/inference_realesrgan.py
    https://github.com/XPixelGroup/BasicSR/blob/v1.4.2/basicsr/archs/rrdbnet_arch.py
*/

struct ResidualDenseBlock {
    int num_features;
    int num_grow_ch;
    ggml_tensor* conv1_w;  // [num_grow_ch, num_features, 3, 3]
    ggml_tensor* conv1_b;  // [num_grow_ch]

    ggml_tensor* conv2_w;  // [num_grow_ch, num_features + num_grow_ch, 3, 3]
    ggml_tensor* conv2_b;  // [num_grow_ch]

    ggml_tensor* conv3_w;  // [num_grow_ch, num_features + 2 * num_grow_ch, 3, 3]
    ggml_tensor* conv3_b;  // [num_grow_ch]

    ggml_tensor* conv4_w;  // [num_grow_ch, num_features + 3 * num_grow_ch, 3, 3]
    ggml_tensor* conv4_b;  // [num_grow_ch]

    ggml_tensor* conv5_w;  // [num_features, num_features + 4 * num_grow_ch, 3, 3]
    ggml_tensor* conv5_b;  // [num_features]

    ResidualDenseBlock() {}

    ResidualDenseBlock(int num_feat, int n_grow_ch) {
        num_features = num_feat;
        num_grow_ch  = n_grow_ch;
    }

    size_t calculate_mem_size() {
        size_t mem_size = num_features * num_grow_ch * 3 * 3 * ggml_type_size(GGML_TYPE_F16);  // conv1_w
        mem_size += num_grow_ch * ggml_type_size(GGML_TYPE_F32);                               // conv1_b

        mem_size += (num_features + num_grow_ch) * num_grow_ch * 3 * 3 * ggml_type_size(GGML_TYPE_F16);  // conv2_w
        mem_size += num_grow_ch * ggml_type_size(GGML_TYPE_F32);                                         // conv2_b

        mem_size += (num_features + 2 * num_grow_ch) * num_grow_ch * 3 * 3 * ggml_type_size(GGML_TYPE_F16);  // conv3_w
        mem_size += num_grow_ch * ggml_type_size(GGML_TYPE_F32);                                             // conv3_b

        mem_size += (num_features + 3 * num_grow_ch) * num_grow_ch * 3 * 3 * ggml_type_size(GGML_TYPE_F16);  // conv4_w
        mem_size += num_grow_ch * ggml_type_size(GGML_TYPE_F32);                                             // conv4_b

        mem_size += (num_features + 4 * num_grow_ch) * num_features * 3 * 3 * ggml_type_size(GGML_TYPE_F16);  // conv5_w
        mem_size += num_features * ggml_type_size(GGML_TYPE_F32);                                             // conv5_b

        return mem_size;
    }

    int get_num_tensors() {
        int num_tensors = 10;
        return num_tensors;
    }

    void init_params(ggml_context* ctx) {
        conv1_w = ggml_new_tensor_4d(ctx, GGML_TYPE_F16, 3, 3, num_features, num_grow_ch);
        conv1_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, num_grow_ch);
        conv2_w = ggml_new_tensor_4d(ctx, GGML_TYPE_F16, 3, 3, num_features + num_grow_ch, num_grow_ch);
        conv2_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, num_grow_ch);
        conv3_w = ggml_new_tensor_4d(ctx, GGML_TYPE_F16, 3, 3, num_features + 2 * num_grow_ch, num_grow_ch);
        conv3_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, num_grow_ch);
        conv4_w = ggml_new_tensor_4d(ctx, GGML_TYPE_F16, 3, 3, num_features + 3 * num_grow_ch, num_grow_ch);
        conv4_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, num_grow_ch);
        conv5_w = ggml_new_tensor_4d(ctx, GGML_TYPE_F16, 3, 3, num_features + 4 * num_grow_ch, num_features);
        conv5_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, num_features);
    }

    void map_by_name(std::map<std::string, ggml_tensor*>& tensors, std::string prefix) {
        tensors[prefix + "conv1.weight"] = conv1_w;
        tensors[prefix + "conv1.bias"]   = conv1_b;

        tensors[prefix + "conv2.weight"] = conv2_w;
        tensors[prefix + "conv2.bias"]   = conv2_b;

        tensors[prefix + "conv3.weight"] = conv3_w;
        tensors[prefix + "conv3.bias"]   = conv3_b;

        tensors[prefix + "conv4.weight"] = conv4_w;
        tensors[prefix + "conv4.bias"]   = conv4_b;

        tensors[prefix + "conv5.weight"] = conv5_w;
        tensors[prefix + "conv5.bias"]   = conv5_b;
    }

    ggml_tensor* forward(ggml_context* ctx, ggml_tensor* out_scale, ggml_tensor* x /* feat */) {
        // x1 = self.lrelu(self.conv1(x))
        ggml_tensor* x1 = ggml_nn_conv_2d(ctx, x, conv1_w, conv1_b, 1, 1, 1, 1);
        x1              = ggml_leaky_relu(ctx, x1, 0.2f, true);

        // x2 = self.lrelu(self.conv2(torch.cat((x, x1), 1)))
        ggml_tensor* x_cat = ggml_concat(ctx, x, x1);
        ggml_tensor* x2    = ggml_nn_conv_2d(ctx, x_cat, conv2_w, conv2_b, 1, 1, 1, 1);
        x2                 = ggml_leaky_relu(ctx, x2, 0.2f, true);

        // x3 = self.lrelu(self.conv3(torch.cat((x, x1, x2), 1)))
        x_cat           = ggml_concat(ctx, x_cat, x2);
        ggml_tensor* x3 = ggml_nn_conv_2d(ctx, x_cat, conv3_w, conv3_b, 1, 1, 1, 1);
        x3              = ggml_leaky_relu(ctx, x3, 0.2f, true);

        // x4 = self.lrelu(self.conv4(torch.cat((x, x1, x2, x3), 1)))
        x_cat           = ggml_concat(ctx, x_cat, x3);
        ggml_tensor* x4 = ggml_nn_conv_2d(ctx, x_cat, conv4_w, conv4_b, 1, 1, 1, 1);
        x4              = ggml_leaky_relu(ctx, x4, 0.2f, true);

        // self.conv5(torch.cat((x, x1, x2, x3, x4), 1))
        x_cat           = ggml_concat(ctx, x_cat, x4);
        ggml_tensor* x5 = ggml_nn_conv_2d(ctx, x_cat, conv5_w, conv5_b, 1, 1, 1, 1);

        // return x5 * 0.2 + x
        x5 = ggml_add(ctx, ggml_scale(ctx, x5, out_scale), x);
        return x5;
    }
};

struct EsrganBlock {
    ResidualDenseBlock rd_blocks[3];
    int num_residual_blocks = 3;

    EsrganBlock() {}

    EsrganBlock(int num_feat, int num_grow_ch) {
        for (int i = 0; i < num_residual_blocks; i++) {
            rd_blocks[i] = ResidualDenseBlock(num_feat, num_grow_ch);
        }
    }

    int get_num_tensors() {
        int num_tensors = 0;
        for (int i = 0; i < num_residual_blocks; i++) {
            num_tensors += rd_blocks[i].get_num_tensors();
        }
        return num_tensors;
    }

    size_t calculate_mem_size() {
        size_t mem_size = 0;
        for (int i = 0; i < num_residual_blocks; i++) {
            mem_size += rd_blocks[i].calculate_mem_size();
        }
        return mem_size;
    }

    void init_params(ggml_context* ctx) {
        for (int i = 0; i < num_residual_blocks; i++) {
            rd_blocks[i].init_params(ctx);
        }
    }

    void map_by_name(std::map<std::string, ggml_tensor*>& tensors, std::string prefix) {
        for (int i = 0; i < num_residual_blocks; i++) {
            rd_blocks[i].map_by_name(tensors, prefix + "rdb" + std::to_string(i + 1) + ".");
        }
    }

    ggml_tensor* forward(ggml_context* ctx, ggml_tensor* out_scale, ggml_tensor* x) {
        ggml_tensor* out = x;
        for (int i = 0; i < num_residual_blocks; i++) {
            // out = self.rdb...(x)
            out = rd_blocks[i].forward(ctx, out_scale, out);
        }
        // return out * 0.2 + x
        out = ggml_add(ctx, ggml_scale(ctx, out, out_scale), x);
        return out;
    }
};
|
||||
|
||||
struct ESRGAN : public GGMLModule {
|
||||
int scale = 4; // default RealESRGAN_x4plus_anime_6B
|
||||
int num_blocks = 6; // default RealESRGAN_x4plus_anime_6B
|
||||
int in_channels = 3;
|
||||
int out_channels = 3;
|
||||
int num_features = 64; // default RealESRGAN_x4plus_anime_6B
|
||||
int num_grow_ch = 32; // default RealESRGAN_x4plus_anime_6B
|
||||
int tile_size = 128; // avoid cuda OOM for 4gb VRAM
|
||||
|
||||
ggml_tensor* conv_first_w; // [num_features, in_channels, 3, 3]
|
||||
ggml_tensor* conv_first_b; // [num_features]
|
||||
|
||||
EsrganBlock body_blocks[6];
|
||||
ggml_tensor* conv_body_w; // [num_features, num_features, 3, 3]
|
||||
ggml_tensor* conv_body_b; // [num_features]
|
||||
|
||||
// upsample
|
||||
ggml_tensor* conv_up1_w; // [num_features, num_features, 3, 3]
|
||||
ggml_tensor* conv_up1_b; // [num_features]
|
||||
ggml_tensor* conv_up2_w; // [num_features, num_features, 3, 3]
|
||||
ggml_tensor* conv_up2_b; // [num_features]
|
||||
|
||||
ggml_tensor* conv_hr_w; // [num_features, num_features, 3, 3]
|
||||
ggml_tensor* conv_hr_b; // [num_features]
|
||||
ggml_tensor* conv_last_w; // [out_channels, num_features, 3, 3]
|
||||
ggml_tensor* conv_last_b; // [out_channels]
|
||||
|
||||
bool decode_only = false;
|
||||
|
||||
ESRGAN() {
|
||||
name = "esrgan";
|
||||
for (int i = 0; i < num_blocks; i++) {
|
||||
body_blocks[i] = EsrganBlock(num_features, num_grow_ch);
|
||||
}
|
||||
}
|
||||
|
||||
size_t calculate_mem_size() {
|
||||
size_t mem_size = num_features * in_channels * 3 * 3 * ggml_type_size(GGML_TYPE_F16); // conv_first_w
|
||||
mem_size += num_features * ggml_type_size(GGML_TYPE_F32); // conv_first_b
|
||||
|
||||
for (int i = 0; i < num_blocks; i++) {
|
||||
mem_size += body_blocks[i].calculate_mem_size();
|
||||
}
|
||||
|
||||
mem_size += num_features * num_features * 3 * 3 * ggml_type_size(GGML_TYPE_F16); // conv_body_w
|
||||
mem_size += num_features * ggml_type_size(GGML_TYPE_F32); // conv_body_w
|
||||
|
||||
        // upsample
        mem_size += num_features * num_features * 3 * 3 * ggml_type_size(GGML_TYPE_F16);  // conv_up1_w
        mem_size += num_features * ggml_type_size(GGML_TYPE_F32);                         // conv_up1_b

        mem_size += num_features * num_features * 3 * 3 * ggml_type_size(GGML_TYPE_F16);  // conv_up2_w
        mem_size += num_features * ggml_type_size(GGML_TYPE_F32);                         // conv_up2_b

        mem_size += num_features * num_features * 3 * 3 * ggml_type_size(GGML_TYPE_F16);  // conv_hr_w
        mem_size += num_features * ggml_type_size(GGML_TYPE_F32);                         // conv_hr_b

        mem_size += out_channels * num_features * 3 * 3 * ggml_type_size(GGML_TYPE_F16);  // conv_last_w
        mem_size += out_channels * ggml_type_size(GGML_TYPE_F32);                         // conv_last_b
        return mem_size;
    }

    size_t get_num_tensors() {
        size_t num_tensors = 12;
        for (int i = 0; i < num_blocks; i++) {
            num_tensors += body_blocks[i].get_num_tensors();
        }
        return num_tensors;
    }

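    // Tensors created in params_ctx start with data == NULL; the allocator
    // below then places each of them into the pre-sized params_buffer.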
    void init_params() {
        ggml_allocr* alloc = ggml_allocr_new_from_buffer(params_buffer);
        conv_first_w = ggml_new_tensor_4d(params_ctx, GGML_TYPE_F16, 3, 3, in_channels, num_features);
        conv_first_b = ggml_new_tensor_1d(params_ctx, GGML_TYPE_F32, num_features);
        conv_body_w  = ggml_new_tensor_4d(params_ctx, GGML_TYPE_F16, 3, 3, num_features, num_features);
        conv_body_b  = ggml_new_tensor_1d(params_ctx, GGML_TYPE_F32, num_features);
        conv_up1_w   = ggml_new_tensor_4d(params_ctx, GGML_TYPE_F16, 3, 3, num_features, num_features);
        conv_up1_b   = ggml_new_tensor_1d(params_ctx, GGML_TYPE_F32, num_features);
        conv_up2_w   = ggml_new_tensor_4d(params_ctx, GGML_TYPE_F16, 3, 3, num_features, num_features);
        conv_up2_b   = ggml_new_tensor_1d(params_ctx, GGML_TYPE_F32, num_features);
        conv_hr_w    = ggml_new_tensor_4d(params_ctx, GGML_TYPE_F16, 3, 3, num_features, num_features);
        conv_hr_b    = ggml_new_tensor_1d(params_ctx, GGML_TYPE_F32, num_features);
        conv_last_w  = ggml_new_tensor_4d(params_ctx, GGML_TYPE_F16, 3, 3, num_features, out_channels);
        conv_last_b  = ggml_new_tensor_1d(params_ctx, GGML_TYPE_F32, out_channels);

        for (int i = 0; i < num_blocks; i++) {
            body_blocks[i].init_params(params_ctx);
        }

        // alloc all tensors linked to this context
        for (struct ggml_tensor* t = ggml_get_first_tensor(params_ctx); t != NULL; t = ggml_get_next_tensor(params_ctx, t)) {
            if (t->data == NULL) {
                ggml_allocr_alloc(alloc, t);
            }
        }
        ggml_allocr_free(alloc);
    }

    bool load_from_file(const std::string& file_path, ggml_backend_t backend) {
        LOG_INFO("loading esrgan from '%s'", file_path.c_str());

        if (!alloc_params_buffer(backend)) {
            return false;
        }

        std::map<std::string, ggml_tensor*> esrgan_tensors;

        // prepare memory for the weights
        {
            init_params();
            map_by_name(esrgan_tensors);
        }

        ModelLoader model_loader;
        if (!model_loader.init_from_file(file_path)) {
            LOG_ERROR("init esrgan model loader from file failed: '%s'", file_path.c_str());
            return false;
        }

        bool success = model_loader.load_tensors(esrgan_tensors, backend);

        if (!success) {
            LOG_ERROR("load esrgan tensors from model loader failed");
            return false;
        }

        LOG_INFO("esrgan model loaded");
        return success;
    }

    void map_by_name(std::map<std::string, ggml_tensor*>& tensors) {
        tensors["conv_first.weight"] = conv_first_w;
        tensors["conv_first.bias"]   = conv_first_b;

        for (int i = 0; i < num_blocks; i++) {
            body_blocks[i].map_by_name(tensors, "body." + std::to_string(i) + ".");
        }

        tensors["conv_body.weight"] = conv_body_w;
        tensors["conv_body.bias"]   = conv_body_b;

        tensors["conv_up1.weight"] = conv_up1_w;
        tensors["conv_up1.bias"]   = conv_up1_b;
        tensors["conv_up2.weight"] = conv_up2_w;
        tensors["conv_up2.bias"]   = conv_up2_b;
        tensors["conv_hr.weight"]  = conv_hr_w;
        tensors["conv_hr.bias"]    = conv_hr_b;

        tensors["conv_last.weight"] = conv_last_w;
        tensors["conv_last.bias"]   = conv_last_b;
    }

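    // x4 upscaling pipeline: conv_first -> RRDB trunk with skip connection ->
    // two nearest-neighbor 2x upsamples (each followed by conv + leaky ReLU) ->
    // conv_hr -> conv_last.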
    ggml_tensor* forward(ggml_context* ctx0, ggml_tensor* out_scale, ggml_tensor* x /* feat */) {
        // feat = self.conv_first(feat)
        auto h = ggml_nn_conv_2d(ctx0, x, conv_first_w, conv_first_b, 1, 1, 1, 1);

        auto body_h = h;
        // self.body(feat)
        for (int i = 0; i < num_blocks; i++) {
            body_h = body_blocks[i].forward(ctx0, out_scale, body_h);
        }

        // body_feat = self.conv_body(self.body(feat))
        body_h = ggml_nn_conv_2d(ctx0, body_h, conv_body_w, conv_body_b, 1, 1, 1, 1);

        // feat = feat + body_feat
        h = ggml_add(ctx0, h, body_h);

        // upsample
        // feat = self.lrelu(self.conv_up1(F.interpolate(feat, scale_factor=2, mode='nearest')))
        h = ggml_upscale(ctx0, h, 2);
        h = ggml_nn_conv_2d(ctx0, h, conv_up1_w, conv_up1_b, 1, 1, 1, 1);
        h = ggml_leaky_relu(ctx0, h, 0.2f, true);

        // feat = self.lrelu(self.conv_up2(F.interpolate(feat, scale_factor=2, mode='nearest')))
        h = ggml_upscale(ctx0, h, 2);
        h = ggml_nn_conv_2d(ctx0, h, conv_up2_w, conv_up2_b, 1, 1, 1, 1);
        h = ggml_leaky_relu(ctx0, h, 0.2f, true);

        // out = self.conv_last(self.lrelu(self.conv_hr(feat)))
        h = ggml_nn_conv_2d(ctx0, h, conv_hr_w, conv_hr_b, 1, 1, 1, 1);
        h = ggml_leaky_relu(ctx0, h, 0.2f, true);

        h = ggml_nn_conv_2d(ctx0, h, conv_last_w, conv_last_b, 1, 1, 1, 1);
        return h;
    }

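    // build_graph() runs twice: once under a "measure" allocator so ggml-alloc
    // can size the compute buffer, then again for the real computation, which
    // is why tensor data is only uploaded when the allocator is not measuring.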
    struct ggml_cgraph* build_graph(struct ggml_tensor* x) {
        // since we are using ggml-alloc, this buffer only needs enough space to hold the ggml_tensor and ggml_cgraph structs, but not the tensor data
        static size_t buf_size = ggml_tensor_overhead() * GGML_DEFAULT_GRAPH_SIZE + ggml_graph_overhead();
        static std::vector<uint8_t> buf(buf_size);

        struct ggml_init_params params = {
            /*.mem_size   =*/buf_size,
            /*.mem_buffer =*/buf.data(),
            /*.no_alloc   =*/true,  // the tensors will be allocated later by ggml_allocr_alloc_graph()
        };

        struct ggml_context* ctx0 = ggml_init(params);

        struct ggml_cgraph* gf = ggml_new_graph(ctx0);

        struct ggml_tensor* x_ = NULL;
        struct ggml_tensor* os = ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, 1);
        ggml_allocr_alloc(compute_allocr, os);
        if (!ggml_allocr_is_measure(compute_allocr)) {
            float scale = 0.2f;
            ggml_backend_tensor_set(os, &scale, 0, sizeof(scale));
        }

        // it's performing a compute, check if backend isn't cpu
        if (!ggml_backend_is_cpu(backend)) {
            // pass input tensors to gpu memory
            x_ = ggml_dup_tensor(ctx0, x);
            ggml_allocr_alloc(compute_allocr, x_);

            // pass data to device backend
            if (!ggml_allocr_is_measure(compute_allocr)) {
                ggml_backend_tensor_set(x_, x->data, 0, ggml_nbytes(x));
            }
        } else {
            x_ = x;
        }

        struct ggml_tensor* out = forward(ctx0, os, x_);  // use the backend copy, not the host tensor

        ggml_build_forward_expand(gf, out);
        ggml_free(ctx0);

        return gf;
    }

    void alloc_compute_buffer(struct ggml_tensor* x) {
        auto get_graph = [&]() -> struct ggml_cgraph* {
            return build_graph(x);
        };
        GGMLModule::alloc_compute_buffer(get_graph);
    }

    void compute(struct ggml_tensor* work_result, const int n_threads, struct ggml_tensor* x) {
        auto get_graph = [&]() -> struct ggml_cgraph* {
            return build_graph(x);
        };
        GGMLModule::compute(get_graph, n_threads, work_result);
    }
};

#endif  // __ESRGAN_HPP__
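
// A minimal usage sketch (not part of this commit), driving the module above
// with only the methods defined in this header; `backend`, `input`, `output`,
// and `n_threads` are assumed to be provided by the caller:
//
//     ESRGAN esrgan;
//     if (esrgan.load_from_file("RealESRGAN_x4plus_anime_6B.pth", backend)) {
//         esrgan.alloc_compute_buffer(input);        // measure pass sizes the buffer
//         esrgan.compute(output, n_threads, input);  // real pass writes the result
//     }
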
@ -1,9 +1,12 @@
#include <stdio.h>
#include <ctime>
#include <string.h>
#include <time.h>
#include <iostream>
#include <random>
#include "ggml/ggml.h"
#include <string>
#include <vector>

#include "stable-diffusion.h"
#include "util.h"

#define STB_IMAGE_IMPLEMENTATION
#include "stb_image.h"
@ -12,11 +15,6 @@
#define STB_IMAGE_WRITE_STATIC
#include "stb_image_write.h"

#include <cstring>
#include <iostream>
#include <string>
#include <vector>

const char* rng_type_to_str[] = {
    "std_default",
    "cuda",
@ -60,7 +58,7 @@ struct SDParams {
    std::string vae_path;
    std::string taesd_path;
    std::string esrgan_path;
    ggml_type wtype = GGML_TYPE_COUNT;
    sd_type_t wtype = SD_TYPE_COUNT;
    std::string lora_model_dir;
    std::string output_path = "output.png";
    std::string input_path;
@ -73,22 +71,34 @@ struct SDParams {
    int height = 512;
    int batch_count = 1;

    SampleMethod sample_method = EULER_A;
    Schedule schedule = DEFAULT;
    sample_method_t sample_method = EULER_A;
    schedule_t schedule = DEFAULT;
    int sample_steps = 20;
    float strength = 0.75f;
    RNGType rng_type = CUDA_RNG;
    rng_type_t rng_type = CUDA_RNG;
    int64_t seed = 42;
    bool verbose = false;
    bool vae_tiling = false;
};

static std::string sd_basename(const std::string& path) {
    size_t pos = path.find_last_of('/');
    if (pos != std::string::npos) {
        return path.substr(pos + 1);
    }
    pos = path.find_last_of('\\');
    if (pos != std::string::npos) {
        return path.substr(pos + 1);
    }
    return path;
}

void print_params(SDParams params) {
    printf("Option: \n");
    printf(" n_threads: %d\n", params.n_threads);
    printf(" mode: %s\n", modes_str[params.mode]);
    printf(" model_path: %s\n", params.model_path.c_str());
    printf(" wtype: %s\n", params.wtype < GGML_TYPE_COUNT ? ggml_type_name(params.wtype) : "unspecified");
    printf(" wtype: %s\n", params.wtype < SD_TYPE_COUNT ? sd_type_name(params.wtype) : "unspecified");
    printf(" vae_path: %s\n", params.vae_path.c_str());
    printf(" taesd_path: %s\n", params.taesd_path.c_str());
    printf(" esrgan_path: %s\n", params.esrgan_path.c_str());
@ -208,19 +218,19 @@ void parse_args(int argc, const char** argv, SDParams& params) {
            }
            std::string type = argv[i];
            if (type == "f32") {
                params.wtype = GGML_TYPE_F32;
                params.wtype = SD_TYPE_F32;
            } else if (type == "f16") {
                params.wtype = GGML_TYPE_F16;
                params.wtype = SD_TYPE_F16;
            } else if (type == "q4_0") {
                params.wtype = GGML_TYPE_Q4_0;
                params.wtype = SD_TYPE_Q4_0;
            } else if (type == "q4_1") {
                params.wtype = GGML_TYPE_Q4_1;
                params.wtype = SD_TYPE_Q4_1;
            } else if (type == "q5_0") {
                params.wtype = GGML_TYPE_Q5_0;
                params.wtype = SD_TYPE_Q5_0;
            } else if (type == "q5_1") {
                params.wtype = GGML_TYPE_Q5_1;
                params.wtype = SD_TYPE_Q5_1;
            } else if (type == "q8_0") {
                params.wtype = GGML_TYPE_Q8_0;
                params.wtype = SD_TYPE_Q8_0;
            } else {
                fprintf(stderr, "error: invalid weight format %s, must be one of [f32, f16, q4_0, q4_1, q5_0, q5_1, q8_0]\n",
                        type.c_str());
@ -330,7 +340,7 @@ void parse_args(int argc, const char** argv, SDParams& params) {
                invalid_arg = true;
                break;
            }
            params.schedule = (Schedule)schedule_found;
            params.schedule = (schedule_t)schedule_found;
        } else if (arg == "-s" || arg == "--seed") {
            if (++i >= argc) {
                invalid_arg = true;
@ -353,7 +363,7 @@ void parse_args(int argc, const char** argv, SDParams& params) {
                invalid_arg = true;
                break;
            }
            params.sample_method = (SampleMethod)sample_method_found;
            params.sample_method = (sample_method_t)sample_method_found;
        } else if (arg == "-h" || arg == "--help") {
            print_usage(argc, argv);
            exit(0);
@ -433,7 +443,7 @@ std::string get_image_params(SDParams params, int64_t seed) {
    parameter_string += "CFG scale: " + std::to_string(params.cfg_scale) + ", ";
    parameter_string += "Seed: " + std::to_string(seed) + ", ";
    parameter_string += "Size: " + std::to_string(params.width) + "x" + std::to_string(params.height) + ", ";
    parameter_string += "Model: " + basename(params.model_path) + ", ";
    parameter_string += "Model: " + sd_basename(params.model_path) + ", ";
    parameter_string += "RNG: " + std::string(rng_type_to_str[params.rng_type]) + ", ";
    parameter_string += "Sampler: " + std::string(sample_method_str[params.sample_method]);
    if (params.schedule == KARRAS) {
@ -444,14 +454,29 @@ std::string get_image_params(SDParams params, int64_t seed) {
    return parameter_string;
}

void sd_log_cb(enum sd_log_level_t level, const char* log, void* data) {
    SDParams* params = (SDParams*)data;
    if (!params->verbose && level <= SD_LOG_DEBUG) {
        return;
    }
    if (level <= SD_LOG_INFO) {
        fprintf(stdout, "%s", log);  // print via "%s" so '%' in the message is not treated as a format specifier
        fflush(stdout);
    } else {
        fprintf(stderr, "%s", log);
        fflush(stderr);
    }
}

int main(int argc, const char* argv[]) {
    SDParams params;
    parse_args(argc, argv, params);

    sd_set_log_callback(sd_log_cb, (void*)&params);

    if (params.verbose) {
        print_params(params);
        printf("%s", sd_get_system_info().c_str());
        set_sd_log_level(SDLogLevel::DEBUG);
        printf("%s", sd_get_system_info());
    }

    bool vae_decode_only = true;
@ -482,16 +507,29 @@ int main(int argc, const char* argv[]) {
        }
    }

    StableDiffusion sd(params.n_threads, vae_decode_only, params.taesd_path, params.esrgan_path, true, params.vae_tiling, params.lora_model_dir, params.rng_type);
    sd_ctx_t* sd_ctx = new_sd_ctx(params.model_path.c_str(),
                                  params.vae_path.c_str(),
                                  params.taesd_path.c_str(),
                                  params.lora_model_dir.c_str(),
                                  vae_decode_only,
                                  params.vae_tiling,
                                  true,
                                  params.n_threads,
                                  params.wtype,
                                  params.rng_type,
                                  params.schedule);

    if (!sd.load_from_file(params.model_path, params.vae_path, params.wtype, params.schedule, params.clip_skip)) {
    if (sd_ctx == NULL) {
        printf("new_sd_ctx_t failed\n");
        return 1;
    }

    std::vector<uint8_t*> results;
    sd_image_t* results;
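    // With the C API, results is a C array of sd_image_t with batch_count
    // entries (NULL on failure) instead of a std::vector of raw buffers.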
    if (params.mode == TXT2IMG) {
        results = sd.txt2img(params.prompt,
                             params.negative_prompt,
        results = txt2img(sd_ctx,
                          params.prompt.c_str(),
                          params.negative_prompt.c_str(),
                          params.clip_skip,
                          params.cfg_scale,
                          params.width,
                          params.height,
@ -500,42 +538,67 @@ int main(int argc, const char* argv[]) {
                          params.seed,
                          params.batch_count);
    } else {
        results = sd.img2img(input_image_buffer,
                             params.prompt,
                             params.negative_prompt,
        sd_image_t input_image = {(uint32_t)params.width,
                                  (uint32_t)params.height,
                                  3,
                                  input_image_buffer};

        results = img2img(sd_ctx,
                          input_image,
                          params.prompt.c_str(),
                          params.negative_prompt.c_str(),
                          params.clip_skip,
                          params.cfg_scale,
                          params.width,
                          params.height,
                          params.sample_method,
                          params.sample_steps,
                          params.strength,
                          params.seed);
                          params.seed,
                          params.batch_count);
    }

    if (params.esrgan_path.size() > 0) {
        // TODO: support more ESRGAN models, making it easier to set up ESRGAN models.
        /* hardcoded scale factor because just RealESRGAN_x4plus_anime_6B is compatible
           See also: https://github.com/xinntao/Real-ESRGAN/blob/master/inference_realesrgan.py

           To avoid this, the upscaler needs to be separated from the stable diffusion pipeline.
           However, a considerable amount of work would be required for this. It might be better
           to opt for a complete project refactoring that facilitates the easier assignment of parameters.
        */
        params.width *= 4;
        params.height *= 4;
    }

    if (results.size() == 0 || results.size() != params.batch_count) {
        LOG_ERROR("generate failed");
    if (results == NULL) {
        printf("generate failed\n");
        return 1;
    }

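    // The C API decouples upscaling from generation: an upscaler context is
    // created on demand from esrgan_path, and each image in the batch is run
    // through it after sampling finishes.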
    int upscale_factor = 4;  // unused for RealESRGAN_x4plus_anime_6B.pth
    if (params.esrgan_path.size() > 0) {
        upscaler_ctx_t* upscaler_ctx = new_upscaler_ctx(params.esrgan_path.c_str(),
                                                        params.n_threads,
                                                        params.wtype);

        if (upscaler_ctx == NULL) {
            printf("new_upscaler_ctx failed\n");
        } else {
            for (int i = 0; i < params.batch_count; i++) {
                if (results