feat: load weights from safetensors and ckpt (#101)
This commit is contained in:
parent
47dd704198
commit
d7af2c2ba9
3
.gitignore
vendored
3
.gitignore
vendored
@ -8,5 +8,6 @@ test/
|
||||
*.bin
|
||||
*.exe
|
||||
*.gguf
|
||||
*.log
|
||||
output.png
|
||||
models/*
|
||||
models/
|
@ -25,7 +25,7 @@ endif()
|
||||
#option(SD_BUILD_TESTS "sd: build tests" ${SD_STANDALONE})
|
||||
option(SD_BUILD_EXAMPLES "sd: build examples" ${SD_STANDALONE})
|
||||
option(SD_CUBLAS "sd: cuda backend" OFF)
|
||||
option(SD_FLASH_ATTN "sd: use flash attention for x4 less memory usage" OFF)
|
||||
option(SD_FLASH_ATTN "sd: use flash attention for x4 less memory usage" OFF)
|
||||
option(BUILD_SHARED_LIBS "sd: build shared libs" OFF)
|
||||
#option(SD_BUILD_SERVER "sd: build server example" ON)
|
||||
|
||||
@ -45,14 +45,15 @@ set(CMAKE_POLICY_DEFAULT_CMP0077 NEW)
|
||||
# deps
|
||||
add_subdirectory(ggml)
|
||||
|
||||
add_subdirectory(thirdparty)
|
||||
|
||||
set(SD_LIB stable-diffusion)
|
||||
|
||||
add_library(${SD_LIB} stable-diffusion.h stable-diffusion.cpp)
|
||||
target_link_libraries(${SD_LIB} PUBLIC ggml)
|
||||
target_include_directories(${SD_LIB} PUBLIC .)
|
||||
add_library(${SD_LIB} stable-diffusion.h stable-diffusion.cpp model.h model.cpp util.h util.cpp)
|
||||
target_link_libraries(${SD_LIB} PUBLIC ggml zip)
|
||||
target_include_directories(${SD_LIB} PUBLIC . thirdparty)
|
||||
target_compile_features(${SD_LIB} PUBLIC cxx_std_11)
|
||||
|
||||
add_subdirectory(common)
|
||||
|
||||
if (SD_BUILD_EXAMPLES)
|
||||
add_subdirectory(examples)
|
||||
|
63
README.md
63
README.md
@ -10,13 +10,15 @@ Inference of [Stable Diffusion](https://github.com/CompVis/stable-diffusion) in
|
||||
|
||||
- Plain C/C++ implementation based on [ggml](https://github.com/ggerganov/ggml), working in the same way as [llama.cpp](https://github.com/ggerganov/llama.cpp)
|
||||
- Super lightweight and without external dependencies.
|
||||
- SD1.x and SD2.x support
|
||||
- 16-bit, 32-bit float support
|
||||
- 4-bit, 5-bit and 8-bit integer quantization support
|
||||
- Accelerated memory-efficient CPU inference
|
||||
- Only requires ~2.3GB when using txt2img with fp16 precision to generate a 512x512 image, enabling Flash Attention just requires ~1.8GB.
|
||||
- AVX, AVX2 and AVX512 support for x86 architectures
|
||||
- SD1.x and SD2.x support
|
||||
- Full CUDA backend for GPU acceleration, for now just for float16 and float32 models. There are some issues with quantized models and CUDA; it will be fixed in the future.
|
||||
- Can load ckpt, safetensors and diffusers models/checkpoints. Standalone VAEs models.
|
||||
- No need to convert to `.ggml` or `.gguf` anymore!
|
||||
- Flash Attention for memory usage optimization (only cpu for now).
|
||||
- Original `txt2img` and `img2img` mode
|
||||
- Negative prompt
|
||||
@ -68,7 +70,7 @@ git submodule init
|
||||
git submodule update
|
||||
```
|
||||
|
||||
### Convert weights
|
||||
### Download weights
|
||||
|
||||
- download original weights(.ckpt or .safetensors). For example
|
||||
- Stable Diffusion v1.4 from https://huggingface.co/CompVis/stable-diffusion-v-1-4-original
|
||||
@ -81,22 +83,6 @@ git submodule update
|
||||
# curl -L -O https://huggingface.co/stabilityai/stable-diffusion-2-1/blob/main/v2-1_768-nonema-pruned.safetensors
|
||||
```
|
||||
|
||||
- convert weights to gguf model format
|
||||
|
||||
```shell
|
||||
./bin/convert sd-v1-4.ckpt -t f16
|
||||
```
|
||||
|
||||
### Quantization
|
||||
|
||||
You can specify the output model format using the `--type` or `-t` parameter
|
||||
|
||||
- `f16` for 16-bit floating-point
|
||||
- `f32` for 32-bit floating-point
|
||||
- `q8_0` for 8-bit integer quantization
|
||||
- `q5_0` or `q5_1` for 5-bit integer quantization
|
||||
- `q4_0` or `q4_1` for 4-bit integer quantization
|
||||
|
||||
### Build
|
||||
|
||||
#### Build from scratch
|
||||
@ -144,9 +130,11 @@ arguments:
|
||||
-t, --threads N number of threads to use during computation (default: -1).
|
||||
If threads <= 0, then threads will be set to the number of CPU physical cores
|
||||
-m, --model [MODEL] path to model
|
||||
--lora-model-dir [DIR] lora model directory
|
||||
--vae [VAE] path to vae
|
||||
--type [TYPE] weight type (f32, f16, q4_0, q4_1, q5_0, q5_1, q8_0)
|
||||
If not specified, the default is the type of the weight file. --lora-model-dir [DIR] lora model directory
|
||||
-i, --init-img [IMAGE] path to the input image, required by img2img
|
||||
-o, --output OUTPUT path to write result image to (default: .\output.png)
|
||||
-o, --output OUTPUT path to write result image to (default: ./output.png)
|
||||
-p, --prompt [PROMPT] the prompt to render
|
||||
-n, --negative-prompt PROMPT the negative prompt (default: "")
|
||||
--cfg-scale SCALE unconditional guidance scale: (default: 7.0)
|
||||
@ -164,10 +152,21 @@ arguments:
|
||||
-v, --verbose print extra info
|
||||
```
|
||||
|
||||
#### Quantization
|
||||
|
||||
You can specify the model weight type using the `--type` parameter. The weights are automatically converted when loading the model.
|
||||
|
||||
- `f16` for 16-bit floating-point
|
||||
- `f32` for 32-bit floating-point
|
||||
- `q8_0` for 8-bit integer quantization
|
||||
- `q5_0` or `q5_1` for 5-bit integer quantization
|
||||
- `q4_0` or `q4_1` for 4-bit integer quantization
|
||||
|
||||
#### txt2img example
|
||||
|
||||
```
|
||||
./bin/sd -m ../sd-v1-4-f16.gguf -p "a lovely cat"
|
||||
```sh
|
||||
./bin/sd -m ../models/sd-v1-4.ckpt -p "a lovely cat"
|
||||
# ./bin/sd -m ../models/v1-5-pruned-emaonly.safetensors -p "a lovely cat"
|
||||
```
|
||||
|
||||
Using formats of different precisions will yield results of varying quality.
|
||||
@ -182,7 +181,7 @@ Using formats of different precisions will yield results of varying quality.
|
||||
|
||||
|
||||
```
|
||||
./bin/sd --mode img2img -m ../models/sd-v1-4-f16.gguf -p "cat with blue eyes" -i ./output.png -o ./img2img_output.png --strength 0.4
|
||||
./bin/sd --mode img2img -m ../models/sd-v1-4.ckpt -p "cat with blue eyes" -i ./output.png -o ./img2img_output.png --strength 0.4
|
||||
```
|
||||
|
||||
<p align="center">
|
||||
@ -191,13 +190,6 @@ Using formats of different precisions will yield results of varying quality.
|
||||
|
||||
#### with LoRA
|
||||
|
||||
- convert lora weights to gguf model format
|
||||
|
||||
```shell
|
||||
bin/convert [lora path] -t f16
|
||||
# For example, bin/convert marblesh.safetensors -t f16
|
||||
```
|
||||
|
||||
- You can specify the directory where the lora weights are stored via `--lora-model-dir`. If not specified, the default is the current working directory.
|
||||
|
||||
- LoRA is specified via prompt, just like [stable-diffusion-webui](https://github.com/AUTOMATIC1111/stable-diffusion-webui/wiki/Features#lora).
|
||||
@ -205,10 +197,10 @@ Using formats of different precisions will yield results of varying quality.
|
||||
Here's a simple example:
|
||||
|
||||
```
|
||||
./bin/sd -m ../models/v1-5-pruned-emaonly-f16.gguf -p "a lovely cat<lora:marblesh:1>" --lora-model-dir ../models
|
||||
./bin/sd -m ../models/v1-5-pruned-emaonly.safetensors -p "a lovely cat<lora:marblesh:1>" --lora-model-dir ../models
|
||||
```
|
||||
|
||||
`../models/marblesh.gguf` will be applied to the model
|
||||
`../models/marblesh.safetensors` or `../models/marblesh.ckpt` will be applied to the model
|
||||
|
||||
#### LCM/LCM-LoRA
|
||||
|
||||
@ -219,7 +211,7 @@ Here's a simple example:
|
||||
Here's a simple example:
|
||||
|
||||
```
|
||||
./bin/sd -m ../models/v1-5-pruned-emaonly-f16.gguf -p "a lovely cat<lora:lcm-lora-sdv1-5:1>" --steps 4 --lora-model-dir ../models -v --cfg-scale 1
|
||||
./bin/sd -m ../models/v1-5-pruned-emaonly.safetensors -p "a lovely cat<lora:lcm-lora-sdv1-5:1>" --steps 4 --lora-model-dir ../models -v --cfg-scale 1
|
||||
```
|
||||
|
||||
| without LCM-LoRA (--cfg-scale 7) | with LCM-LoRA (--cfg-scale 1) |
|
||||
@ -240,14 +232,13 @@ docker build -t sd .
|
||||
```shell
|
||||
docker run -v /path/to/models:/models -v /path/to/output/:/output sd [args...]
|
||||
# For example
|
||||
# docker run -v ./models:/models -v ./build:/output sd -m /models/sd-v1-4-f16.gguf -p "a lovely cat" -v -o /output/output.png
|
||||
# docker run -v ./models:/models -v ./build:/output sd -m /models/sd-v1-4.ckpt -p "a lovely cat" -v -o /output/output.png
|
||||
```
|
||||
|
||||
## Memory/Disk Requirements
|
||||
## Memory Requirements
|
||||
|
||||
| precision | f32 | f16 |q8_0 |q5_0 |q5_1 |q4_0 |q4_1 |
|
||||
| ---- | ---- |---- |---- |---- |---- |---- |---- |
|
||||
| **Disk** | 2.7G | 2.0G | 1.7G | 1.6G | 1.6G | 1.5G | 1.5G |
|
||||
| **Memory** (txt2img - 512 x 512) | ~2.8G | ~2.3G | ~2.1G | ~2.0G | ~2.0G | ~2.0G | ~2.0G |
|
||||
| **Memory** (txt2img - 512 x 512) *with Flash Attention* | ~2.4G | ~1.9G | ~1.6G | ~1.5G | ~1.5G | ~1.5G | ~1.5G |
|
||||
|
||||
|
@ -1,15 +0,0 @@
|
||||
set(TARGET common)
|
||||
|
||||
# json.hpp library from: https://github.com/nlohmann/json
|
||||
|
||||
add_library(${TARGET} OBJECT common.cpp common.h stb_image.h stb_image_write.h json.hpp)
|
||||
|
||||
target_include_directories(${TARGET} PUBLIC .)
|
||||
target_link_libraries(${TARGET} PRIVATE stable-diffusion ${CMAKE_THREAD_LIBS_INIT})
|
||||
target_compile_features(${TARGET} PUBLIC cxx_std_11)
|
||||
|
||||
# ZIP Library from: https://github.com/kuba--/zip
|
||||
|
||||
set(Z_TARGET zip)
|
||||
add_library(${Z_TARGET} OBJECT zip.c zip.h miniz.h)
|
||||
target_include_directories(${Z_TARGET} PUBLIC .)
|
@ -1,391 +0,0 @@
|
||||
#include "common.h"
|
||||
#include <cstring>
|
||||
#include <fstream>
|
||||
#include <iostream>
|
||||
#include <string>
|
||||
#include <thread>
|
||||
#include <unordered_set>
|
||||
#include <vector>
|
||||
|
||||
#if defined(__APPLE__) && defined(__MACH__)
|
||||
#include <sys/sysctl.h>
|
||||
#include <sys/types.h>
|
||||
#endif
|
||||
|
||||
#if !defined(_WIN32)
|
||||
#include <sys/ioctl.h>
|
||||
#include <unistd.h>
|
||||
#endif
|
||||
|
||||
// get_num_physical_cores is copy from
|
||||
// https://github.com/ggerganov/llama.cpp/blob/master/examples/common.cpp
|
||||
// LICENSE: https://github.com/ggerganov/llama.cpp/blob/master/LICENSE
|
||||
int32_t get_num_physical_cores() {
|
||||
#ifdef __linux__
|
||||
// enumerate the set of thread siblings, num entries is num cores
|
||||
std::unordered_set<std::string> siblings;
|
||||
for (uint32_t cpu = 0; cpu < UINT32_MAX; ++cpu) {
|
||||
std::ifstream thread_siblings("/sys/devices/system/cpu" + std::to_string(cpu) + "/topology/thread_siblings");
|
||||
if (!thread_siblings.is_open()) {
|
||||
break; // no more cpus
|
||||
}
|
||||
std::string line;
|
||||
if (std::getline(thread_siblings, line)) {
|
||||
siblings.insert(line);
|
||||
}
|
||||
}
|
||||
if (siblings.size() > 0) {
|
||||
return static_cast<int32_t>(siblings.size());
|
||||
}
|
||||
#elif defined(__APPLE__) && defined(__MACH__)
|
||||
int32_t num_physical_cores;
|
||||
size_t len = sizeof(num_physical_cores);
|
||||
int result = sysctlbyname("hw.perflevel0.physicalcpu", &num_physical_cores, &len, NULL, 0);
|
||||
if (result == 0) {
|
||||
return num_physical_cores;
|
||||
}
|
||||
result = sysctlbyname("hw.physicalcpu", &num_physical_cores, &len, NULL, 0);
|
||||
if (result == 0) {
|
||||
return num_physical_cores;
|
||||
}
|
||||
#elif defined(_WIN32)
|
||||
// TODO: Implement
|
||||
#endif
|
||||
unsigned int n_threads = std::thread::hardware_concurrency();
|
||||
return n_threads > 0 ? (n_threads <= 4 ? n_threads : n_threads / 2) : 4;
|
||||
}
|
||||
|
||||
const char* rng_type_to_str[] = {
|
||||
"std_default",
|
||||
"cuda",
|
||||
};
|
||||
|
||||
// Names of the sampler method, same order as enum sample_method in stable-diffusion.h
|
||||
const char* sample_method_str[] = {
|
||||
"euler_a",
|
||||
"euler",
|
||||
"heun",
|
||||
"dpm2",
|
||||
"dpm++2s_a",
|
||||
"dpm++2m",
|
||||
"dpm++2mv2",
|
||||
"lcm",
|
||||
};
|
||||
|
||||
// Names of the sigma schedule overrides, same order as sample_schedule in stable-diffusion.h
|
||||
const char* schedule_str[] = {
|
||||
"default",
|
||||
"discrete",
|
||||
"karras"};
|
||||
|
||||
const char* modes_str[] = {
|
||||
"txt2img",
|
||||
"img2img"};
|
||||
|
||||
void print_params(SDParams params) {
|
||||
printf("Option: \n");
|
||||
printf(" n_threads: %d\n", params.n_threads);
|
||||
printf(" mode: %s\n", modes_str[params.mode]);
|
||||
printf(" model_path: %s\n", params.model_path.c_str());
|
||||
printf(" output_path: %s\n", params.output_path.c_str());
|
||||
printf(" init_img: %s\n", params.input_path.c_str());
|
||||
printf(" prompt: %s\n", params.prompt.c_str());
|
||||
printf(" negative_prompt: %s\n", params.negative_prompt.c_str());
|
||||
printf(" cfg_scale: %.2f\n", params.cfg_scale);
|
||||
printf(" width: %d\n", params.width);
|
||||
printf(" height: %d\n", params.height);
|
||||
printf(" sample_method: %s\n", sample_method_str[params.sample_method]);
|
||||
printf(" schedule: %s\n", schedule_str[params.schedule]);
|
||||
printf(" sample_steps: %d\n", params.sample_steps);
|
||||
printf(" strength: %.2f\n", params.strength);
|
||||
printf(" rng: %s\n", rng_type_to_str[params.rng_type]);
|
||||
printf(" seed: %ld\n", params.seed);
|
||||
printf(" batch_count: %d\n", params.batch_count);
|
||||
}
|
||||
|
||||
void print_usage(int argc, const char* argv[]) {
|
||||
printf("usage: %s [arguments]\n", argv[0]);
|
||||
printf("\n");
|
||||
printf("arguments:\n");
|
||||
printf(" -h, --help show this help message and exit\n");
|
||||
printf(" -M, --mode [txt2img or img2img] generation mode (default: txt2img)\n");
|
||||
printf(" -t, --threads N number of threads to use during computation (default: -1).\n");
|
||||
printf(" If threads <= 0, then threads will be set to the number of CPU physical cores\n");
|
||||
printf(" -m, --model [MODEL] path to model\n");
|
||||
printf(" --lora-model-dir [DIR] lora model directory\n");
|
||||
printf(" -i, --init-img [IMAGE] path to the input image, required by img2img\n");
|
||||
printf(" -o, --output OUTPUT path to write result image to (default: ./output.png)\n");
|
||||
printf(" -p, --prompt [PROMPT] the prompt to render\n");
|
||||
printf(" -n, --negative-prompt PROMPT the negative prompt (default: \"\")\n");
|
||||
printf(" --cfg-scale SCALE unconditional guidance scale: (default: 7.0)\n");
|
||||
printf(" --strength STRENGTH strength for noising/unnoising (default: 0.75)\n");
|
||||
printf(" 1.0 corresponds to full destruction of information in init image\n");
|
||||
printf(" -H, --height H image height, in pixel space (default: 512)\n");
|
||||
printf(" -W, --width W image width, in pixel space (default: 512)\n");
|
||||
printf(" --sampling-method {euler, euler_a, heun, dpm2, dpm++2s_a, dpm++2m, dpm++2mv2, lcm}\n");
|
||||
printf(" sampling method (default: \"euler_a\")\n");
|
||||
printf(" --steps STEPS number of sample steps (default: 20)\n");
|
||||
printf(" --rng {std_default, cuda} RNG (default: cuda)\n");
|
||||
printf(" -s SEED, --seed SEED RNG seed (default: 42, use random seed for < 0)\n");
|
||||
printf(" -b, --batch-count COUNT number of images to generate.\n");
|
||||
printf(" --schedule {discrete, karras} Denoiser sigma schedule (default: discrete)\n");
|
||||
printf(" -v, --verbose print extra info\n");
|
||||
}
|
||||
|
||||
void parse_args(int argc, const char** argv, SDParams& params) {
|
||||
bool invalid_arg = false;
|
||||
std::string arg;
|
||||
for (int i = 1; i < argc; i++) {
|
||||
arg = argv[i];
|
||||
|
||||
if (arg == "-t" || arg == "--threads") {
|
||||
if (++i >= argc) {
|
||||
invalid_arg = true;
|
||||
break;
|
||||
}
|
||||
params.n_threads = std::stoi(argv[i]);
|
||||
} else if (arg == "-M" || arg == "--mode") {
|
||||
if (++i >= argc) {
|
||||
invalid_arg = true;
|
||||
break;
|
||||
}
|
||||
const char* mode_selected = argv[i];
|
||||
int mode_found = -1;
|
||||
for (int d = 0; d < MODE_COUNT; d++) {
|
||||
if (!strcmp(mode_selected, modes_str[d])) {
|
||||
mode_found = d;
|
||||
}
|
||||
}
|
||||
if (mode_found == -1) {
|
||||
fprintf(stderr, "error: invalid mode %s, must be one of [txt2img, img2img]\n",
|
||||
mode_selected);
|
||||
exit(1);
|
||||
}
|
||||
params.mode = (sd_mode)mode_found;
|
||||
} else if (arg == "-m" || arg == "--model") {
|
||||
if (++i >= argc) {
|
||||
invalid_arg = true;
|
||||
break;
|
||||
}
|
||||
params.model_path = argv[i];
|
||||
} else if (arg == "--lora-model-dir") {
|
||||
if (++i >= argc) {
|
||||
invalid_arg = true;
|
||||
break;
|
||||
}
|
||||
params.lora_model_dir = argv[i];
|
||||
} else if (arg == "-i" || arg == "--init-img") {
|
||||
if (++i >= argc) {
|
||||
invalid_arg = true;
|
||||
break;
|
||||
}
|
||||
params.input_path = argv[i];
|
||||
} else if (arg == "-o" || arg == "--output") {
|
||||
if (++i >= argc) {
|
||||
invalid_arg = true;
|
||||
break;
|
||||
}
|
||||
params.output_path = argv[i];
|
||||
} else if (arg == "-p" || arg == "--prompt") {
|
||||
if (++i >= argc) {
|
||||
invalid_arg = true;
|
||||
break;
|
||||
}
|
||||
params.prompt = argv[i];
|
||||
} else if (arg == "-n" || arg == "--negative-prompt") {
|
||||
if (++i >= argc) {
|
||||
invalid_arg = true;
|
||||
break;
|
||||
}
|
||||
params.negative_prompt = argv[i];
|
||||
} else if (arg == "--cfg-scale") {
|
||||
if (++i >= argc) {
|
||||
invalid_arg = true;
|
||||
break;
|
||||
}
|
||||
params.cfg_scale = std::stof(argv[i]);
|
||||
} else if (arg == "--strength") {
|
||||
if (++i >= argc) {
|
||||
invalid_arg = true;
|
||||
break;
|
||||
}
|
||||
params.strength = std::stof(argv[i]);
|
||||
} else if (arg == "-H" || arg == "--height") {
|
||||
if (++i >= argc) {
|
||||
invalid_arg = true;
|
||||
break;
|
||||
}
|
||||
params.height = std::stoi(argv[i]);
|
||||
} else if (arg == "-W" || arg == "--width") {
|
||||
if (++i >= argc) {
|
||||
invalid_arg = true;
|
||||
break;
|
||||
}
|
||||
params.width = std::stoi(argv[i]);
|
||||
} else if (arg == "--steps") {
|
||||
if (++i >= argc) {
|
||||
invalid_arg = true;
|
||||
break;
|
||||
}
|
||||
params.sample_steps = std::stoi(argv[i]);
|
||||
} else if (arg == "-b" || arg == "--batch-count") {
|
||||
if (++i >= argc) {
|
||||
invalid_arg = true;
|
||||
break;
|
||||
}
|
||||
params.batch_count = std::stoi(argv[i]);
|
||||
} else if (arg == "--rng") {
|
||||
if (++i >= argc) {
|
||||
invalid_arg = true;
|
||||
break;
|
||||
}
|
||||
std::string rng_type_str = argv[i];
|
||||
if (rng_type_str == "std_default") {
|
||||
params.rng_type = STD_DEFAULT_RNG;
|
||||
} else if (rng_type_str == "cuda") {
|
||||
params.rng_type = CUDA_RNG;
|
||||
} else {
|
||||
invalid_arg = true;
|
||||
break;
|
||||
}
|
||||
} else if (arg == "--schedule") {
|
||||
if (++i >= argc) {
|
||||
invalid_arg = true;
|
||||
break;
|
||||
}
|
||||
const char* schedule_selected = argv[i];
|
||||
int schedule_found = -1;
|
||||
for (int d = 0; d < N_SCHEDULES; d++) {
|
||||
if (!strcmp(schedule_selected, schedule_str[d])) {
|
||||
schedule_found = d;
|
||||
}
|
||||
}
|
||||
if (schedule_found == -1) {
|
||||
invalid_arg = true;
|
||||
break;
|
||||
}
|
||||
params.schedule = (Schedule)schedule_found;
|
||||
} else if (arg == "-s" || arg == "--seed") {
|
||||
if (++i >= argc) {
|
||||
invalid_arg = true;
|
||||
break;
|
||||
}
|
||||
params.seed = std::stoll(argv[i]);
|
||||
} else if (arg == "--sampling-method") {
|
||||
if (++i >= argc) {
|
||||
invalid_arg = true;
|
||||
break;
|
||||
}
|
||||
const char* sample_method_selected = argv[i];
|
||||
int sample_method_found = -1;
|
||||
for (int m = 0; m < N_SAMPLE_METHODS; m++) {
|
||||
if (!strcmp(sample_method_selected, sample_method_str[m])) {
|
||||
sample_method_found = m;
|
||||
}
|
||||
}
|
||||
if (sample_method_found == -1) {
|
||||
invalid_arg = true;
|
||||
break;
|
||||
}
|
||||
params.sample_method = (SampleMethod)sample_method_found;
|
||||
} else if (arg == "-h" || arg == "--help") {
|
||||
print_usage(argc, argv);
|
||||
exit(0);
|
||||
} else if (arg == "-v" || arg == "--verbose") {
|
||||
params.verbose = true;
|
||||
} else {
|
||||
fprintf(stderr, "error: unknown argument: %s\n", arg.c_str());
|
||||
print_usage(argc, argv);
|
||||
exit(1);
|
||||
}
|
||||
}
|
||||
if (invalid_arg) {
|
||||
fprintf(stderr, "error: invalid parameter for argument: %s\n", arg.c_str());
|
||||
print_usage(argc, argv);
|
||||
exit(1);
|
||||
}
|
||||
if (params.n_threads <= 0) {
|
||||
params.n_threads = get_num_physical_cores();
|
||||
}
|
||||
|
||||
if (params.prompt.length() == 0) {
|
||||
fprintf(stderr, "error: the following arguments are required: prompt\n");
|
||||
print_usage(argc, argv);
|
||||
exit(1);
|
||||
}
|
||||
|
||||
if (params.model_path.length() == 0) {
|
||||
fprintf(stderr, "error: the following arguments are required: model_path\n");
|
||||
print_usage(argc, argv);
|
||||
exit(1);
|
||||
}
|
||||
|
||||
if (params.mode == IMG2IMG && params.input_path.length() == 0) {
|
||||
fprintf(stderr, "error: when using the img2img mode, the following arguments are required: init-img\n");
|
||||
print_usage(argc, argv);
|
||||
exit(1);
|
||||
}
|
||||
|
||||
if (params.output_path.length() == 0) {
|
||||
fprintf(stderr, "error: the following arguments are required: output_path\n");
|
||||
print_usage(argc, argv);
|
||||
exit(1);
|
||||
}
|
||||
|
||||
if (params.width <= 0 || params.width % 64 != 0) {
|
||||
fprintf(stderr, "error: the width must be a multiple of 64\n");
|
||||
exit(1);
|
||||
}
|
||||
|
||||
if (params.height <= 0 || params.height % 64 != 0) {
|
||||
fprintf(stderr, "error: the height must be a multiple of 64\n");
|
||||
exit(1);
|
||||
}
|
||||
|
||||
if (params.sample_steps <= 0) {
|
||||
fprintf(stderr, "error: the sample_steps must be greater than 0\n");
|
||||
exit(1);
|
||||
}
|
||||
|
||||
if (params.strength < 0.f || params.strength > 1.f) {
|
||||
fprintf(stderr, "error: can only work with strength in [0.0, 1.0]\n");
|
||||
exit(1);
|
||||
}
|
||||
|
||||
if (params.seed < 0) {
|
||||
srand((int)time(NULL));
|
||||
params.seed = rand();
|
||||
}
|
||||
}
|
||||
|
||||
std::string basename(const std::string& path) {
|
||||
size_t pos = path.find_last_of('/');
|
||||
if (pos != std::string::npos) {
|
||||
return path.substr(pos + 1);
|
||||
}
|
||||
pos = path.find_last_of('\\');
|
||||
if (pos != std::string::npos) {
|
||||
return path.substr(pos + 1);
|
||||
}
|
||||
return path;
|
||||
}
|
||||
|
||||
std::string get_image_params(SDParams params, int seed) {
|
||||
std::string parameter_string = params.prompt + "\n";
|
||||
if (params.negative_prompt.size() != 0) {
|
||||
parameter_string += "Negative prompt: " + params.negative_prompt + "\n";
|
||||
}
|
||||
parameter_string += "Steps: " + std::to_string(params.sample_steps) + ", ";
|
||||
parameter_string += "CFG scale: " + std::to_string(params.cfg_scale) + ", ";
|
||||
parameter_string += "Seed: " + std::to_string(seed) + ", ";
|
||||
parameter_string += "Size: " + std::to_string(params.width) + "x" + std::to_string(params.height) + ", ";
|
||||
parameter_string += "Model: " + basename(params.model_path) + ", ";
|
||||
parameter_string += "RNG: " + std::string(rng_type_to_str[params.rng_type]) + ", ";
|
||||
parameter_string += "Sampler: " + std::string(sample_method_str[params.sample_method]);
|
||||
if (params.schedule == KARRAS) {
|
||||
parameter_string += " karras";
|
||||
}
|
||||
parameter_string += ", ";
|
||||
parameter_string += "Version: stable-diffusion.cpp";
|
||||
return parameter_string;
|
||||
}
|
@ -1,43 +0,0 @@
|
||||
#pragma once
|
||||
|
||||
#include <string>
|
||||
#include "stable-diffusion.h"
|
||||
|
||||
enum sd_mode {
|
||||
TXT2IMG,
|
||||
IMG2IMG,
|
||||
MODE_COUNT
|
||||
};
|
||||
|
||||
struct SDParams {
|
||||
int n_threads = -1;
|
||||
sd_mode mode = TXT2IMG;
|
||||
|
||||
std::string model_path;
|
||||
std::string lora_model_dir;
|
||||
std::string output_path = "output.png";
|
||||
std::string input_path;
|
||||
|
||||
std::string prompt;
|
||||
std::string negative_prompt;
|
||||
float cfg_scale = 7.0f;
|
||||
int width = 512;
|
||||
int height = 512;
|
||||
int batch_count = 1;
|
||||
|
||||
SampleMethod sample_method = EULER_A;
|
||||
Schedule schedule = DEFAULT;
|
||||
int sample_steps = 20;
|
||||
float strength = 0.75f;
|
||||
RNGType rng_type = CUDA_RNG;
|
||||
int64_t seed = 42;
|
||||
bool verbose = false;
|
||||
};
|
||||
|
||||
void print_params(SDParams params);
|
||||
|
||||
void print_usage(int argc, const char* argv[]);
|
||||
|
||||
void parse_args(int argc, const char** argv, SDParams& params);
|
||||
|
||||
std::string get_image_params(SDParams params, int seed);
|
@ -1,4 +1,3 @@
|
||||
include_directories(${CMAKE_CURRENT_SOURCE_DIR})
|
||||
|
||||
add_subdirectory(cli)
|
||||
add_subdirectory(convert)
|
||||
add_subdirectory(cli)
|
@ -2,5 +2,5 @@ set(TARGET sd)
|
||||
|
||||
add_executable(${TARGET} main.cpp)
|
||||
install(TARGETS ${TARGET} RUNTIME)
|
||||
target_link_libraries(${TARGET} PRIVATE stable-diffusion common ${CMAKE_THREAD_LIBS_INIT})
|
||||
target_link_libraries(${TARGET} PRIVATE stable-diffusion ${CMAKE_THREAD_LIBS_INIT})
|
||||
target_compile_features(${TARGET} PUBLIC cxx_std_11)
|
@ -1,8 +1,9 @@
|
||||
#include <stdio.h>
|
||||
#include <ctime>
|
||||
#include <random>
|
||||
#include "common.h"
|
||||
#include "ggml/ggml.h"
|
||||
#include "stable-diffusion.h"
|
||||
#include "util.h"
|
||||
|
||||
#define STB_IMAGE_IMPLEMENTATION
|
||||
#include "stb_image.h"
|
||||
@ -11,6 +12,405 @@
|
||||
#define STB_IMAGE_WRITE_STATIC
|
||||
#include "stb_image_write.h"
|
||||
|
||||
#include <cstring>
|
||||
#include <iostream>
|
||||
#include <string>
|
||||
#include <vector>
|
||||
|
||||
const char* rng_type_to_str[] = {
|
||||
"std_default",
|
||||
"cuda",
|
||||
};
|
||||
|
||||
// Names of the sampler method, same order as enum sample_method in stable-diffusion.h
|
||||
const char* sample_method_str[] = {
|
||||
"euler_a",
|
||||
"euler",
|
||||
"heun",
|
||||
"dpm2",
|
||||
"dpm++2s_a",
|
||||
"dpm++2m",
|
||||
"dpm++2mv2",
|
||||
"lcm",
|
||||
};
|
||||
|
||||
// Names of the sigma schedule overrides, same order as sample_schedule in stable-diffusion.h
|
||||
const char* schedule_str[] = {
|
||||
"default",
|
||||
"discrete",
|
||||
"karras",
|
||||
};
|
||||
|
||||
const char* modes_str[] = {
|
||||
"txt2img",
|
||||
"img2img",
|
||||
};
|
||||
|
||||
enum SDMode {
|
||||
TXT2IMG,
|
||||
IMG2IMG,
|
||||
MODE_COUNT
|
||||
};
|
||||
|
||||
struct SDParams {
|
||||
int n_threads = -1;
|
||||
SDMode mode = TXT2IMG;
|
||||
|
||||
std::string model_path;
|
||||
std::string vae_path;
|
||||
ggml_type wtype = GGML_TYPE_COUNT;
|
||||
std::string lora_model_dir;
|
||||
std::string output_path = "output.png";
|
||||
std::string input_path;
|
||||
|
||||
std::string prompt;
|
||||
std::string negative_prompt;
|
||||
float cfg_scale = 7.0f;
|
||||
int width = 512;
|
||||
int height = 512;
|
||||
int batch_count = 1;
|
||||
|
||||
SampleMethod sample_method = EULER_A;
|
||||
Schedule schedule = DEFAULT;
|
||||
int sample_steps = 20;
|
||||
float strength = 0.75f;
|
||||
RNGType rng_type = CUDA_RNG;
|
||||
int64_t seed = 42;
|
||||
bool verbose = false;
|
||||
};
|
||||
|
||||
void print_params(SDParams params) {
|
||||
printf("Option: \n");
|
||||
printf(" n_threads: %d\n", params.n_threads);
|
||||
printf(" mode: %s\n", modes_str[params.mode]);
|
||||
printf(" model_path: %s\n", params.model_path.c_str());
|
||||
printf(" wtype: %s\n", params.wtype < GGML_TYPE_COUNT ? ggml_type_name(params.wtype) : "unspecified");
|
||||
printf(" vae_path: %s\n", params.vae_path.c_str());
|
||||
printf(" output_path: %s\n", params.output_path.c_str());
|
||||
printf(" init_img: %s\n", params.input_path.c_str());
|
||||
printf(" prompt: %s\n", params.prompt.c_str());
|
||||
printf(" negative_prompt: %s\n", params.negative_prompt.c_str());
|
||||
printf(" cfg_scale: %.2f\n", params.cfg_scale);
|
||||
printf(" width: %d\n", params.width);
|
||||
printf(" height: %d\n", params.height);
|
||||
printf(" sample_method: %s\n", sample_method_str[params.sample_method]);
|
||||
printf(" schedule: %s\n", schedule_str[params.schedule]);
|
||||
printf(" sample_steps: %d\n", params.sample_steps);
|
||||
printf(" strength(img2img): %.2f\n", params.strength);
|
||||
printf(" rng: %s\n", rng_type_to_str[params.rng_type]);
|
||||
printf(" seed: %ld\n", params.seed);
|
||||
printf(" batch_count: %d\n", params.batch_count);
|
||||
}
|
||||
|
||||
void print_usage(int argc, const char* argv[]) {
|
||||
printf("usage: %s [arguments]\n", argv[0]);
|
||||
printf("\n");
|
||||
printf("arguments:\n");
|
||||
printf(" -h, --help show this help message and exit\n");
|
||||
printf(" -M, --mode [txt2img or img2img] generation mode (default: txt2img)\n");
|
||||
printf(" -t, --threads N number of threads to use during computation (default: -1).\n");
|
||||
printf(" If threads <= 0, then threads will be set to the number of CPU physical cores\n");
|
||||
printf(" -m, --model [MODEL] path to model\n");
|
||||
printf(" --vae [VAE] path to vae\n");
|
||||
printf(" --type [TYPE] weight type (f32, f16, q4_0, q4_1, q5_0, q5_1, q8_0)\n");
|
||||
printf(" If not specified, the default is the type of the weight file.");
|
||||
printf(" --lora-model-dir [DIR] lora model directory\n");
|
||||
printf(" -i, --init-img [IMAGE] path to the input image, required by img2img\n");
|
||||
printf(" -o, --output OUTPUT path to write result image to (default: ./output.png)\n");
|
||||
printf(" -p, --prompt [PROMPT] the prompt to render\n");
|
||||
printf(" -n, --negative-prompt PROMPT the negative prompt (default: \"\")\n");
|
||||
printf(" --cfg-scale SCALE unconditional guidance scale: (default: 7.0)\n");
|
||||
printf(" --strength STRENGTH strength for noising/unnoising (default: 0.75)\n");
|
||||
printf(" 1.0 corresponds to full destruction of information in init image\n");
|
||||
printf(" -H, --height H image height, in pixel space (default: 512)\n");
|
||||
printf(" -W, --width W image width, in pixel space (default: 512)\n");
|
||||
printf(" --sampling-method {euler, euler_a, heun, dpm2, dpm++2s_a, dpm++2m, dpm++2mv2, lcm}\n");
|
||||
printf(" sampling method (default: \"euler_a\")\n");
|
||||
printf(" --steps STEPS number of sample steps (default: 20)\n");
|
||||
printf(" --rng {std_default, cuda} RNG (default: cuda)\n");
|
||||
printf(" -s SEED, --seed SEED RNG seed (default: 42, use random seed for < 0)\n");
|
||||
printf(" -b, --batch-count COUNT number of images to generate.\n");
|
||||
printf(" --schedule {discrete, karras} Denoiser sigma schedule (default: discrete)\n");
|
||||
printf(" -v, --verbose print extra info\n");
|
||||
}
|
||||
|
||||
void parse_args(int argc, const char** argv, SDParams& params) {
|
||||
bool invalid_arg = false;
|
||||
std::string arg;
|
||||
for (int i = 1; i < argc; i++) {
|
||||
arg = argv[i];
|
||||
|
||||
if (arg == "-t" || arg == "--threads") {
|
||||
if (++i >= argc) {
|
||||
invalid_arg = true;
|
||||
break;
|
||||
}
|
||||
params.n_threads = std::stoi(argv[i]);
|
||||
} else if (arg == "-M" || arg == "--mode") {
|
||||
if (++i >= argc) {
|
||||
invalid_arg = true;
|
||||
break;
|
||||
}
|
||||
const char* mode_selected = argv[i];
|
||||
int mode_found = -1;
|
||||
for (int d = 0; d < MODE_COUNT; d++) {
|
||||
if (!strcmp(mode_selected, modes_str[d])) {
|
||||
mode_found = d;
|
||||
}
|
||||
}
|
||||
if (mode_found == -1) {
|
||||
fprintf(stderr, "error: invalid mode %s, must be one of [txt2img, img2img]\n",
|
||||
mode_selected);
|
||||
exit(1);
|
||||
}
|
||||
params.mode = (SDMode)mode_found;
|
||||
} else if (arg == "-m" || arg == "--model") {
|
||||
if (++i >= argc) {
|
||||
invalid_arg = true;
|
||||
break;
|
||||
}
|
||||
params.model_path = argv[i];
|
||||
} else if (arg == "--vae") {
|
||||
if (++i >= argc) {
|
||||
invalid_arg = true;
|
||||
break;
|
||||
}
|
||||
params.vae_path = argv[i];
|
||||
} else if (arg == "--type") {
|
||||
if (++i >= argc) {
|
||||
invalid_arg = true;
|
||||
break;
|
||||
}
|
||||
std::string type = argv[i];
|
||||
if (type == "f32") {
|
||||
params.wtype = GGML_TYPE_F32;
|
||||
} else if (type == "f16") {
|
||||
params.wtype = GGML_TYPE_F16;
|
||||
} else if (type == "q4_0") {
|
||||
params.wtype = GGML_TYPE_Q4_0;
|
||||
} else if (type == "q4_1") {
|
||||
params.wtype = GGML_TYPE_Q4_1;
|
||||
} else if (type == "q5_0") {
|
||||
params.wtype = GGML_TYPE_Q5_0;
|
||||
} else if (type == "q5_1") {
|
||||
params.wtype = GGML_TYPE_Q5_1;
|
||||
} else if (type == "q8_0") {
|
||||
params.wtype = GGML_TYPE_Q8_0;
|
||||
} else {
|
||||
fprintf(stderr, "error: invalid weight format %s, must be one of [f32, f16, q4_0, q4_1, q5_0, q5_1, q8_0]\n",
|
||||
type.c_str());
|
||||
exit(1);
|
||||
}
|
||||
} else if (arg == "--lora-model-dir") {
|
||||
if (++i >= argc) {
|
||||
invalid_arg = true;
|
||||
break;
|
||||
}
|
||||
params.lora_model_dir = argv[i];
|
||||
} else if (arg == "-i" || arg == "--init-img") {
|
||||
if (++i >= argc) {
|
||||
invalid_arg = true;
|
||||
break;
|
||||
}
|
||||
params.input_path = argv[i];
|
||||
} else if (arg == "-o" || arg == "--output") {
|
||||
if (++i >= argc) {
|
||||
invalid_arg = true;
|
||||
break;
|
||||
}
|
||||
params.output_path = argv[i];
|
||||
} else if (arg == "-p" || arg == "--prompt") {
|
||||
if (++i >= argc) {
|
||||
invalid_arg = true;
|
||||
break;
|
||||
}
|
||||
params.prompt = argv[i];
|
||||
} else if (arg == "-n" || arg == "--negative-prompt") {
|
||||
if (++i >= argc) {
|
||||
invalid_arg = true;
|
||||
break;
|
||||
}
|
||||
params.negative_prompt = argv[i];
|
||||
} else if (arg == "--cfg-scale") {
|
||||
if (++i >= argc) {
|
||||
invalid_arg = true;
|
||||
break;
|
||||
}
|
||||
params.cfg_scale = std::stof(argv[i]);
|
||||
} else if (arg == "--strength") {
|
||||
if (++i >= argc) {
|
||||
invalid_arg = true;
|
||||
break;
|
||||
}
|
||||
params.strength = std::stof(argv[i]);
|
||||
} else if (arg == "-H" || arg == "--height") {
|
||||
if (++i >= argc) {
|
||||
invalid_arg = true;
|
||||
break;
|
||||
}
|
||||
params.height = std::stoi(argv[i]);
|
||||
} else if (arg == "-W" || arg == "--width") {
|
||||
if (++i >= argc) {
|
||||
invalid_arg = true;
|
||||
break;
|
||||
}
|
||||
params.width = std::stoi(argv[i]);
|
||||
} else if (arg == "--steps") {
|
||||
if (++i >= argc) {
|
||||
invalid_arg = true;
|
||||
break;
|
||||
}
|
||||
params.sample_steps = std::stoi(argv[i]);
|
||||
} else if (arg == "-b" || arg == "--batch-count") {
|
||||
if (++i >= argc) {
|
||||
invalid_arg = true;
|
||||
break;
|
||||
}
|
||||
params.batch_count = std::stoi(argv[i]);
|
||||
} else if (arg == "--rng") {
|
||||
if (++i >= argc) {
|
||||
invalid_arg = true;
|
||||
break;
|
||||
}
|
||||
std::string rng_type_str = argv[i];
|
||||
if (rng_type_str == "std_default") {
|
||||
params.rng_type = STD_DEFAULT_RNG;
|
||||
} else if (rng_type_str == "cuda") {
|
||||
params.rng_type = CUDA_RNG;
|
||||
} else {
|
||||
invalid_arg = true;
|
||||
break;
|
||||
}
|
||||
} else if (arg == "--schedule") {
|
||||
if (++i >= argc) {
|
||||
invalid_arg = true;
|
||||
break;
|
||||
}
|
||||
const char* schedule_selected = argv[i];
|
||||
int schedule_found = -1;
|
||||
for (int d = 0; d < N_SCHEDULES; d++) {
|
||||
if (!strcmp(schedule_selected, schedule_str[d])) {
|
||||
schedule_found = d;
|
||||
}
|
||||
}
|
||||
if (schedule_found == -1) {
|
||||
invalid_arg = true;
|
||||
break;
|
||||
}
|
||||
params.schedule = (Schedule)schedule_found;
|
||||
} else if (arg == "-s" || arg == "--seed") {
|
||||
if (++i >= argc) {
|
||||
invalid_arg = true;
|
||||
break;
|
||||
}
|
||||
params.seed = std::stoll(argv[i]);
|
||||
} else if (arg == "--sampling-method") {
|
||||
if (++i >= argc) {
|
||||
invalid_arg = true;
|
||||
break;
|
||||
}
|
||||
const char* sample_method_selected = argv[i];
|
||||
int sample_method_found = -1;
|
||||
for (int m = 0; m < N_SAMPLE_METHODS; m++) {
|
||||
if (!strcmp(sample_method_selected, sample_method_str[m])) {
|
||||
sample_method_found = m;
|
||||
}
|
||||
}
|
||||
if (sample_method_found == -1) {
|
||||
invalid_arg = true;
|
||||
break;
|
||||
}
|
||||
params.sample_method = (SampleMethod)sample_method_found;
|
||||
} else if (arg == "-h" || arg == "--help") {
|
||||
print_usage(argc, argv);
|
||||
exit(0);
|
||||
} else if (arg == "-v" || arg == "--verbose") {
|
||||
params.verbose = true;
|
||||
} else {
|
||||
fprintf(stderr, "error: unknown argument: %s\n", arg.c_str());
|
||||
print_usage(argc, argv);
|
||||
exit(1);
|
||||
}
|
||||
}
|
||||
if (invalid_arg) {
|
||||
fprintf(stderr, "error: invalid parameter for argument: %s\n", arg.c_str());
|
||||
print_usage(argc, argv);
|
||||
exit(1);
|
||||
}
|
||||
if (params.n_threads <= 0) {
|
||||
params.n_threads = get_num_physical_cores();
|
||||
}
|
||||
|
||||
if (params.prompt.length() == 0) {
|
||||
fprintf(stderr, "error: the following arguments are required: prompt\n");
|
||||
print_usage(argc, argv);
|
||||
exit(1);
|
||||
}
|
||||
|
||||
if (params.model_path.length() == 0) {
|
||||
fprintf(stderr, "error: the following arguments are required: model_path\n");
|
||||
print_usage(argc, argv);
|
||||
exit(1);
|
||||
}
|
||||
|
||||
if (params.mode == IMG2IMG && params.input_path.length() == 0) {
|
||||
fprintf(stderr, "error: when using the img2img mode, the following arguments are required: init-img\n");
|
||||
print_usage(argc, argv);
|
||||
exit(1);
|
||||
}
|
||||
|
||||
if (params.output_path.length() == 0) {
|
||||
fprintf(stderr, "error: the following arguments are required: output_path\n");
|
||||
print_usage(argc, argv);
|
||||
exit(1);
|
||||
}
|
||||
|
||||
if (params.width <= 0 || params.width % 64 != 0) {
|
||||
fprintf(stderr, "error: the width must be a multiple of 64\n");
|
||||
exit(1);
|
||||
}
|
||||
|
||||
if (params.height <= 0 || params.height % 64 != 0) {
|
||||
fprintf(stderr, "error: the height must be a multiple of 64\n");
|
||||
exit(1);
|
||||
}
|
||||
|
||||
if (params.sample_steps <= 0) {
|
||||
fprintf(stderr, "error: the sample_steps must be greater than 0\n");
|
||||
exit(1);
|
||||
}
|
||||
|
||||
if (params.strength < 0.f || params.strength > 1.f) {
|
||||
fprintf(stderr, "error: can only work with strength in [0.0, 1.0]\n");
|
||||
exit(1);
|
||||
}
|
||||
|
||||
if (params.seed < 0) {
|
||||
srand((int)time(NULL));
|
||||
params.seed = rand();
|
||||
}
|
||||
}
|
||||
|
||||
std::string get_image_params(SDParams params, int64_t seed) {
|
||||
std::string parameter_string = params.prompt + "\n";
|
||||
if (params.negative_prompt.size() != 0) {
|
||||
parameter_string += "Negative prompt: " + params.negative_prompt + "\n";
|
||||
}
|
||||
parameter_string += "Steps: " + std::to_string(params.sample_steps) + ", ";
|
||||
parameter_string += "CFG scale: " + std::to_string(params.cfg_scale) + ", ";
|
||||
parameter_string += "Seed: " + std::to_string(seed) + ", ";
|
||||
parameter_string += "Size: " + std::to_string(params.width) + "x" + std::to_string(params.height) + ", ";
|
||||
parameter_string += "Model: " + basename(params.model_path) + ", ";
|
||||
parameter_string += "RNG: " + std::string(rng_type_to_str[params.rng_type]) + ", ";
|
||||
parameter_string += "Sampler: " + std::string(sample_method_str[params.sample_method]);
|
||||
if (params.schedule == KARRAS) {
|
||||
parameter_string += " karras";
|
||||
}
|
||||
parameter_string += ", ";
|
||||
parameter_string += "Version: stable-diffusion.cpp";
|
||||
return parameter_string;
|
||||
}
|
||||
|
||||
int main(int argc, const char* argv[]) {
|
||||
SDParams params;
|
||||
parse_args(argc, argv, params);
|
||||
@ -50,7 +450,7 @@ int main(int argc, const char* argv[]) {
|
||||
}
|
||||
|
||||
StableDiffusion sd(params.n_threads, vae_decode_only, true, params.lora_model_dir, params.rng_type);
|
||||
if (!sd.load_from_file(params.model_path, params.schedule)) {
|
||||
if (!sd.load_from_file(params.model_path, params.vae_path, params.wtype, params.schedule)) {
|
||||
return 1;
|
||||
}
|
||||
|
||||
@ -79,7 +479,7 @@ int main(int argc, const char* argv[]) {
|
||||
}
|
||||
|
||||
if (results.size() == 0 || results.size() != params.batch_count) {
|
||||
fprintf(stderr, "generate failed\n");
|
||||
LOG_ERROR("generate failed");
|
||||
return 1;
|
||||
}
|
||||
|
||||
@ -88,7 +488,7 @@ int main(int argc, const char* argv[]) {
|
||||
for (int i = 0; i < params.batch_count; i++) {
|
||||
std::string final_image_path = i > 0 ? dummy_name + "_" + std::to_string(i + 1) + ".png" : dummy_name + ".png";
|
||||
stbi_write_png(final_image_path.c_str(), params.width, params.height, 3, results[i], 0, get_image_params(params, params.seed + i).c_str());
|
||||
printf("save result image to '%s'\n", final_image_path.c_str());
|
||||
LOG_INFO("save result image to '%s'", final_image_path.c_str());
|
||||
}
|
||||
|
||||
return 0;
|
||||
|
@ -1,5 +0,0 @@
|
||||
set(TARGET convert)
|
||||
|
||||
add_executable(${TARGET} convert.cpp vocab.hpp)
|
||||
target_link_libraries(${TARGET} PRIVATE ggml zip ${CMAKE_THREAD_LIBS_INIT})
|
||||
target_compile_features(${TARGET} PUBLIC cxx_std_11)
|
@ -1,16 +0,0 @@
|
||||
# Model Convert
|
||||
|
||||
## Usage
|
||||
```
|
||||
usage: convert.exe [MODEL_PATH] --type [OUT_TYPE] [arguments]
|
||||
Model supported for conversion: .safetensors models or .ckpt checkpoints models
|
||||
|
||||
arguments:
|
||||
-h, --help show this help message and exit
|
||||
-o, --out [FILENAME] path or name to converted model
|
||||
--vocab [FILENAME] path to custom vocab.json (usually unnecessary)
|
||||
-v, --verbose print processing info - dev info
|
||||
-l, --lora force read the model as a LoRA
|
||||
--vae [FILENAME] merge a custom VAE
|
||||
-t, --type [OUT_TYPE] output format (f32, f16, q4_0, q4_1, q5_0, q5_1, q8_0)
|
||||
```
|
File diff suppressed because it is too large
Load Diff
2
format-code.sh
Normal file
2
format-code.sh
Normal file
@ -0,0 +1,2 @@
|
||||
clang-format -style=file -i *.cpp *.h
|
||||
clang-format -style=file -i examples/cli/*.cpp
|
142
model.h
Normal file
142
model.h
Normal file
@ -0,0 +1,142 @@
|
||||
#ifndef __MODEL_H__
|
||||
#define __MODEL_H__
|
||||
|
||||
#include <functional>
|
||||
#include <map>
|
||||
#include <memory>
|
||||
#include <string>
|
||||
#include <vector>
|
||||
|
||||
#include "ggml/ggml.h"
|
||||
#include "json.hpp"
|
||||
#include "zip.h"
|
||||
|
||||
enum SDVersion {
|
||||
VERSION_1_x,
|
||||
VERSION_2_x,
|
||||
VERSION_XL,
|
||||
VERSION_COUNT,
|
||||
};
|
||||
|
||||
struct TensorStorage {
|
||||
std::string name;
|
||||
ggml_type type = GGML_TYPE_F32;
|
||||
bool is_bf16 = false;
|
||||
int64_t ne[4] = {1, 1, 1, 1};
|
||||
int n_dims = 0;
|
||||
|
||||
size_t file_index = 0;
|
||||
int index_in_zip = -1; // >= means stored in a zip file
|
||||
size_t offset = 0; // offset in file
|
||||
|
||||
TensorStorage() = default;
|
||||
|
||||
TensorStorage(const std::string& name, ggml_type type, int64_t* ne, int n_dims, size_t file_index, size_t offset = 0)
|
||||
: name(name), type(type), n_dims(n_dims), file_index(file_index), offset(offset) {
|
||||
for (int i = 0; i < n_dims; i++) {
|
||||
this->ne[i] = ne[i];
|
||||
}
|
||||
}
|
||||
|
||||
int64_t nelements() const {
|
||||
return ne[0] * ne[1] * ne[2] * ne[3];
|
||||
}
|
||||
|
||||
int64_t nbytes() const {
|
||||
return nelements() * ggml_type_size(type) / ggml_blck_size(type);
|
||||
}
|
||||
|
||||
int64_t nbytes_to_read() const {
|
||||
if (is_bf16) {
|
||||
return nbytes() / 2;
|
||||
} else {
|
||||
return nbytes();
|
||||
}
|
||||
}
|
||||
|
||||
void unsqueeze() {
|
||||
if (n_dims == 2) {
|
||||
n_dims = 4;
|
||||
ne[3] = ne[1];
|
||||
ne[2] = ne[0];
|
||||
ne[1] = 1;
|
||||
ne[0] = 1;
|
||||
}
|
||||
}
|
||||
|
||||
std::vector<TensorStorage> chunk(size_t n) {
|
||||
std::vector<TensorStorage> chunks;
|
||||
size_t chunk_size = nbytes_to_read() / n;
|
||||
reverse_ne();
|
||||
for (int i = 0; i < n; i++) {
|
||||
TensorStorage chunk_i = *this;
|
||||
chunk_i.ne[0] = ne[0] / n;
|
||||
chunk_i.offset = offset + i * chunk_size;
|
||||
chunk_i.reverse_ne();
|
||||
chunks.push_back(chunk_i);
|
||||
}
|
||||
reverse_ne();
|
||||
return chunks;
|
||||
}
|
||||
|
||||
void reverse_ne() {
|
||||
int64_t new_ne[4] = {1, 1, 1, 1};
|
||||
for (int i = 0; i < n_dims; i++) {
|
||||
new_ne[i] = ne[n_dims - 1 - i];
|
||||
}
|
||||
for (int i = 0; i < n_dims; i++) {
|
||||
ne[i] = new_ne[i];
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
typedef std::function<bool(const TensorStorage&, ggml_tensor**)> on_new_tensor_cb_t;
|
||||
typedef std::function<void(const std::string&, int32_t)> on_new_token_cb_t;
|
||||
|
||||
class ModelLoader {
|
||||
protected:
|
||||
std::vector<std::string> file_paths_;
|
||||
std::vector<TensorStorage> tensor_storages;
|
||||
|
||||
public:
|
||||
virtual bool init_from_file(const std::string& file_path, const std::string& prefix = "");
|
||||
virtual bool init_from_files(const std::vector<std::string>& file_paths);
|
||||
virtual SDVersion get_sd_version();
|
||||
virtual ggml_type get_sd_wtype();
|
||||
virtual bool load_vocab(on_new_token_cb_t on_new_token_cb);
|
||||
virtual bool load_tensors(on_new_tensor_cb_t on_new_tensor_cb);
|
||||
virtual int64_t cal_mem_size();
|
||||
virtual ~ModelLoader() = default;
|
||||
};
|
||||
|
||||
class GGUFModelLoader : public ModelLoader {
|
||||
public:
|
||||
bool init_from_file(const std::string& file_path, const std::string& prefix = "");
|
||||
};
|
||||
|
||||
class SafeTensorsModelLoader : public ModelLoader {
|
||||
public:
|
||||
bool init_from_file(const std::string& file_path, const std::string& prefix = "");
|
||||
};
|
||||
|
||||
class CkptModelLoader : public ModelLoader {
|
||||
private:
|
||||
bool parse_data_pkl(uint8_t* buffer,
|
||||
size_t buffer_size,
|
||||
zip_t* zip,
|
||||
std::string dir,
|
||||
size_t file_index,
|
||||
const std::string& prefix);
|
||||
|
||||
public:
|
||||
bool init_from_file(const std::string& file_path, const std::string& prefix = "");
|
||||
};
|
||||
|
||||
class DiffusersModelLoader : public SafeTensorsModelLoader {
|
||||
public:
|
||||
bool init_from_file(const std::string& file_path, const std::string& prefix = "");
|
||||
};
|
||||
|
||||
ModelLoader* init_model_loader_from_file(const std::string& file_path);
|
||||
|
||||
#endif // __MODEL_H__
|
@ -1,5 +1,6 @@
|
||||
#include <assert.h>
|
||||
#include <inttypes.h>
|
||||
#include <stdarg.h>
|
||||
#include <algorithm>
|
||||
#include <cstring>
|
||||
#include <fstream>
|
||||
@ -22,61 +23,24 @@
|
||||
#include "ggml-cuda.h"
|
||||
#endif
|
||||
|
||||
#include "model.h"
|
||||
#include "rng.h"
|
||||
#include "rng_philox.h"
|
||||
#include "stable-diffusion.h"
|
||||
#include "util.h"
|
||||
|
||||
#define EPS 1e-05f
|
||||
|
||||
static SDLogLevel log_level = SDLogLevel::INFO;
|
||||
|
||||
#define UNET_GRAPH_SIZE 3328
|
||||
#define LORA_GRAPH_SIZE 4096
|
||||
|
||||
#define __FILENAME__ "stable-diffusion.cpp"
|
||||
#define SD_LOG(level, format, ...) \
|
||||
do { \
|
||||
if (level < log_level) { \
|
||||
break; \
|
||||
} \
|
||||
if (level == SDLogLevel::DEBUG) { \
|
||||
printf("[DEBUG] %s:%-4d - " format "\n", __FILENAME__, __LINE__, ##__VA_ARGS__); \
|
||||
fflush(stdout); \
|
||||
} else if (level == SDLogLevel::INFO) { \
|
||||
printf("[INFO] %s:%-4d - " format "\n", __FILENAME__, __LINE__, ##__VA_ARGS__); \
|
||||
fflush(stdout); \
|
||||
} else if (level == SDLogLevel::WARN) { \
|
||||
fprintf(stderr, "[WARN] %s:%-4d - " format "\n", __FILENAME__, __LINE__, ##__VA_ARGS__); \
|
||||
fflush(stdout); \
|
||||
} else if (level == SDLogLevel::ERROR) { \
|
||||
fprintf(stderr, "[ERROR] %s:%-4d - " format "\n", __FILENAME__, __LINE__, ##__VA_ARGS__); \
|
||||
fflush(stdout); \
|
||||
} \
|
||||
} while (0)
|
||||
|
||||
#define LOG_DEBUG(format, ...) SD_LOG(SDLogLevel::DEBUG, format, ##__VA_ARGS__)
|
||||
#define LOG_INFO(format, ...) SD_LOG(SDLogLevel::INFO, format, ##__VA_ARGS__)
|
||||
#define LOG_WARN(format, ...) SD_LOG(SDLogLevel::WARN, format, ##__VA_ARGS__)
|
||||
#define LOG_ERROR(format, ...) SD_LOG(SDLogLevel::ERROR, format, ##__VA_ARGS__)
|
||||
|
||||
#define TIMESTEPS 1000
|
||||
|
||||
enum SDVersion {
|
||||
VERSION_1_x,
|
||||
VERSION_2_x,
|
||||
VERSION_XL,
|
||||
VERSION_COUNT,
|
||||
};
|
||||
|
||||
const char* model_version_to_str[] = {
|
||||
"1.x",
|
||||
"2.x",
|
||||
"XL"};
|
||||
|
||||
const char* lora_type_to_str[] = {
|
||||
"regular",
|
||||
"diffusers",
|
||||
"transformers"};
|
||||
"XL",
|
||||
};
|
||||
|
||||
const char* sampling_methods_str[] = {
|
||||
"Euler A",
|
||||
@ -86,14 +50,11 @@ const char* sampling_methods_str[] = {
|
||||
"DPM++ (2s)",
|
||||
"DPM++ (2M)",
|
||||
"modified DPM++ (2M)",
|
||||
"LCM"};
|
||||
"LCM",
|
||||
};
|
||||
|
||||
/*================================================== Helper Functions ================================================*/
|
||||
|
||||
void set_sd_log_level(SDLogLevel level) {
|
||||
log_level = level;
|
||||
}
|
||||
|
||||
std::string sd_get_system_info() {
|
||||
std::stringstream ss;
|
||||
ss << "System Info: \n";
|
||||
@ -188,7 +149,7 @@ void print_ggml_tensor(struct ggml_tensor* tensor, bool shape_only = false) {
|
||||
if (shape_only) {
|
||||
return;
|
||||
}
|
||||
int range = 3;
|
||||
int range = 1000;
|
||||
for (int i = 0; i < tensor->ne[3]; i++) {
|
||||
if (i >= range && i + range < tensor->ne[3]) {
|
||||
continue;
|
||||
@ -277,15 +238,46 @@ void sd_fread(void* ptr, size_t size, size_t count, FILE* stream) {
|
||||
}
|
||||
}
|
||||
|
||||
void copy_ggml_tensor(
|
||||
struct ggml_tensor* dst,
|
||||
const struct ggml_tensor* src) {
|
||||
dst->nb[0] = src->nb[0];
|
||||
dst->nb[1] = src->nb[1];
|
||||
dst->nb[2] = src->nb[2];
|
||||
dst->nb[3] = src->nb[3];
|
||||
void copy_ggml_tensor(struct ggml_tensor* dst, struct ggml_tensor* src) {
|
||||
if (dst->type == src->type) {
|
||||
dst->nb[0] = src->nb[0];
|
||||
dst->nb[1] = src->nb[1];
|
||||
dst->nb[2] = src->nb[2];
|
||||
dst->nb[3] = src->nb[3];
|
||||
|
||||
memcpy(((char*)dst->data), ((char*)src->data), ggml_nbytes(dst));
|
||||
memcpy(((char*)dst->data), ((char*)src->data), ggml_nbytes(dst));
|
||||
return;
|
||||
}
|
||||
struct ggml_init_params params;
|
||||
params.mem_size = 10 * 1024 * 1024; // for padding
|
||||
params.mem_buffer = NULL;
|
||||
params.no_alloc = false;
|
||||
struct ggml_context* ctx = ggml_init(params);
|
||||
if (!ctx) {
|
||||
LOG_ERROR("ggml_init() failed");
|
||||
return;
|
||||
}
|
||||
ggml_tensor* final = ggml_cpy_inplace(ctx, src, dst);
|
||||
|
||||
struct ggml_cgraph* graph = ggml_new_graph(ctx);
|
||||
ggml_build_forward_expand(graph, final);
|
||||
ggml_graph_compute_with_ctx(ctx, graph, 1);
|
||||
ggml_free(ctx);
|
||||
}
|
||||
|
||||
void calculate_alphas_cumprod(float* alphas_cumprod,
|
||||
float linear_start = 0.00085f,
|
||||
float linear_end = 0.0120,
|
||||
int timesteps = TIMESTEPS) {
|
||||
float ls_sqrt = sqrtf(linear_start);
|
||||
float le_sqrt = sqrtf(linear_end);
|
||||
float amount = le_sqrt - ls_sqrt;
|
||||
float product = 1.0f;
|
||||
for (int i = 0; i < timesteps; i++) {
|
||||
float beta = ls_sqrt + amount * ((float)i / (timesteps - 1));
|
||||
product *= 1.0f - powf(beta, 2.0f);
|
||||
alphas_cumprod[i] = product;
|
||||
}
|
||||
}
|
||||
|
||||
// Ref: https://github.com/CompVis/stable-diffusion/blob/main/ldm/modules/diffusionmodules/util.py#L151
|
||||
@ -396,22 +388,6 @@ std::pair<std::unordered_map<std::string, float>, std::string> extract_and_remov
|
||||
return std::make_pair(filename2multiplier, text);
|
||||
}
|
||||
|
||||
bool ends_with(const std::string& str, const std::string& ending) {
|
||||
if (str.length() >= ending.length()) {
|
||||
return (str.compare(str.length() - ending.length(), ending.length(), ending) == 0);
|
||||
} else {
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
||||
void replace_all_chars(std::string& str, char target, char replacement) {
|
||||
for (size_t i = 0; i < str.length(); ++i) {
|
||||
if (str[i] == target) {
|
||||
str[i] = replacement;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/*================================================== CLIPTokenizer ===================================================*/
|
||||
|
||||
const std::string UNK_TOKEN = "<|endoftext|>";
|
||||
@ -3244,7 +3220,7 @@ struct AutoEncoderKL {
|
||||
struct ggml_cgraph* gf = build_graph(x, decode);
|
||||
|
||||
// compute the required memory
|
||||
size_t compute_memory_buffer_size = ggml_allocr_alloc_graph(compute_alloc, gf);
|
||||
size_t compute_memory_buffer_size = ggml_allocr_alloc_graph(compute_alloc, gf) + 10 * 1024 * 1024;
|
||||
|
||||
// recreate the allocator with the required memory
|
||||
ggml_allocr_free(compute_alloc);
|
||||
@ -3281,9 +3257,21 @@ struct AutoEncoderKL {
|
||||
}
|
||||
};
|
||||
|
||||
float ggml_backend_tensor_get_f32(ggml_tensor* tensor) {
|
||||
GGML_ASSERT(tensor->type == GGML_TYPE_F32 || tensor->type == GGML_TYPE_F16);
|
||||
float value;
|
||||
if (tensor->type == GGML_TYPE_F32) {
|
||||
ggml_backend_tensor_get(tensor, &value, 0, sizeof(value));
|
||||
} else { // GGML_TYPE_F16
|
||||
ggml_fp16_t f16_value;
|
||||
ggml_backend_tensor_get(tensor, &f16_value, 0, sizeof(f16_value));
|
||||
value = ggml_fp16_to_fp32(f16_value);
|
||||
}
|
||||
return value;
|
||||
}
|
||||
|
||||
struct LoraModel {
|
||||
float strength = 1.0f;
|
||||
std::map<std::string, float> lora_alphas;
|
||||
float multiplier = 1.0f;
|
||||
std::map<std::string, struct ggml_tensor*> lora_tensors;
|
||||
|
||||
struct ggml_context* ctx;
|
||||
@ -3293,37 +3281,15 @@ struct LoraModel {
|
||||
bool load(ggml_backend_t backend_, std::string file_path) {
|
||||
backend = backend_;
|
||||
LOG_INFO("loading LoRA from '%s'", file_path.c_str());
|
||||
ggml_context* ctx_meta = NULL;
|
||||
gguf_context* ctx_gguf = gguf_init_from_file(file_path.c_str(), {true, &ctx_meta});
|
||||
std::shared_ptr<ModelLoader> model_loader = std::shared_ptr<ModelLoader>(init_model_loader_from_file(file_path));
|
||||
|
||||
if (!ctx_gguf) {
|
||||
LOG_ERROR("failed to open '%s'", file_path.c_str());
|
||||
if (!model_loader) {
|
||||
LOG_ERROR("init lora model loader from file failed: '%s'", file_path.c_str());
|
||||
return false;
|
||||
}
|
||||
|
||||
FILE* fp = std::fopen(file_path.c_str(), "rb");
|
||||
|
||||
SDVersion version = VERSION_COUNT;
|
||||
|
||||
int n_kv = gguf_get_n_kv(ctx_gguf);
|
||||
int n_tensors = gguf_get_n_tensors(ctx_gguf);
|
||||
|
||||
for (int i = 0; i < n_kv; i++) {
|
||||
const char* name = gguf_get_key(ctx_gguf, i);
|
||||
const enum gguf_type type = gguf_get_kv_type(ctx_gguf, i);
|
||||
LOG_DEBUG("%s: - kv %3d: %42s %-8s", __func__, i, name, gguf_type_name(type));
|
||||
}
|
||||
|
||||
{
|
||||
int nidx = gguf_find_key(ctx_gguf, "sd.lora.name");
|
||||
int tidx = gguf_find_key(ctx_gguf, "sd.lora.type");
|
||||
if (tidx >= 0 && nidx >= 0) {
|
||||
LOG_INFO("LoRA Type: %s | %s", lora_type_to_str[gguf_get_val_i32(ctx_gguf, tidx) - 1], gguf_get_val_str(ctx_gguf, nidx));
|
||||
}
|
||||
}
|
||||
|
||||
struct ggml_init_params params;
|
||||
params.mem_size = static_cast<size_t>(n_tensors * ggml_tensor_overhead());
|
||||
params.mem_size = static_cast<size_t>(1024 * ggml_tensor_overhead());
|
||||
params.mem_buffer = NULL;
|
||||
params.no_alloc = true;
|
||||
|
||||
@ -3333,82 +3299,28 @@ struct LoraModel {
|
||||
return false;
|
||||
}
|
||||
|
||||
ggml_type wtype = GGML_TYPE_COUNT;
|
||||
{
|
||||
int idx = gguf_find_key(ctx_gguf, "sd.lora.dtype");
|
||||
if (idx >= 0) {
|
||||
wtype = (ggml_type)gguf_get_val_i32(ctx_gguf, idx);
|
||||
LOG_INFO("LoRA data type: %s", ggml_type_name(wtype));
|
||||
}
|
||||
}
|
||||
ggml_type wtype = model_loader->get_sd_wtype();
|
||||
|
||||
LOG_DEBUG("calculating buffer size");
|
||||
int memory_buffer_size = 0;
|
||||
|
||||
for (int i = 0; i < n_tensors; i++) {
|
||||
std::string name = gguf_get_tensor_name(ctx_gguf, i);
|
||||
struct ggml_tensor* dummy = ggml_get_tensor(ctx_meta, name.c_str());
|
||||
memory_buffer_size += (int)ggml_nbytes(dummy);
|
||||
}
|
||||
int64_t memory_buffer_size = model_loader->cal_mem_size();
|
||||
LOG_DEBUG("lora params backend buffer size = % 6.2f MB", memory_buffer_size / (1024.0 * 1024.0));
|
||||
|
||||
params_buffer_lora = ggml_backend_alloc_buffer(backend, memory_buffer_size);
|
||||
|
||||
LOG_DEBUG("loading alphas");
|
||||
{
|
||||
int kidx = gguf_find_key(ctx_gguf, "sd.lora.alphas_k");
|
||||
int vidx = gguf_find_key(ctx_gguf, "sd.lora.alphas_v");
|
||||
int n_alphas = gguf_get_arr_n(ctx_gguf, kidx);
|
||||
if (n_alphas * 2 != n_tensors) {
|
||||
LOG_ERROR("lora alphas expected: %i, got %i", n_tensors, n_alphas * 2);
|
||||
return false;
|
||||
}
|
||||
float* alphas_values = (float*)gguf_get_arr_data(ctx_gguf, vidx);
|
||||
for (int i = 0; i < n_alphas; i++) {
|
||||
std::string alpha_name = gguf_get_arr_str(ctx_gguf, kidx, i);
|
||||
lora_alphas[alpha_name] = alphas_values[i];
|
||||
}
|
||||
}
|
||||
|
||||
ggml_allocr* alloc = ggml_allocr_new_from_buffer(params_buffer_lora);
|
||||
|
||||
size_t data_offset = gguf_get_data_offset(ctx_gguf);
|
||||
std::vector<char> read_buf;
|
||||
for (int i = 0; i < n_tensors; i++) {
|
||||
std::string name = gguf_get_tensor_name(ctx_gguf, i);
|
||||
struct ggml_tensor* dummy = ggml_get_tensor(ctx_meta, name.c_str());
|
||||
size_t offset = data_offset + gguf_get_tensor_offset(ctx_gguf, i);
|
||||
auto on_new_tensor_cb = [&](const TensorStorage& tensor_storage, ggml_tensor** dst_tensor) -> bool {
|
||||
const std::string& name = tensor_storage.name;
|
||||
|
||||
#ifdef _WIN32
|
||||
int ret = _fseeki64(fp, (__int64)offset, SEEK_SET);
|
||||
#else
|
||||
int ret = std::fseek(fp, (long)offset, SEEK_SET);
|
||||
#endif
|
||||
if (ret == -1) {
|
||||
return false;
|
||||
}
|
||||
|
||||
struct ggml_tensor* real = ggml_dup_tensor(ctx, dummy);
|
||||
struct ggml_tensor* real = ggml_new_tensor(ctx, tensor_storage.type, tensor_storage.n_dims, tensor_storage.ne);
|
||||
ggml_allocr_alloc(alloc, real);
|
||||
|
||||
int num_bytes = (int)ggml_nbytes(dummy);
|
||||
|
||||
if (ggml_backend_is_cpu(backend)) {
|
||||
// for the CPU and Metal backend, we can read directly into the tensor
|
||||
sd_fread(real->data, 1, num_bytes, fp);
|
||||
} else {
|
||||
// read into a temporary buffer first, then copy to device memory
|
||||
read_buf.resize(num_bytes);
|
||||
sd_fread(read_buf.data(), 1, num_bytes, fp);
|
||||
ggml_backend_tensor_set(real, read_buf.data(), 0, num_bytes);
|
||||
}
|
||||
*dst_tensor = real;
|
||||
|
||||
lora_tensors[name] = real;
|
||||
}
|
||||
read_buf.clear();
|
||||
std::fclose(fp);
|
||||
gguf_free(ctx_gguf);
|
||||
ggml_free(ctx_meta);
|
||||
return true;
|
||||
};
|
||||
|
||||
model_loader->load_tensors(on_new_tensor_cb);
|
||||
|
||||
LOG_DEBUG("finished loaded lora");
|
||||
ggml_allocr_free(alloc);
|
||||
@ -3428,54 +3340,94 @@ struct LoraModel {
|
||||
};
|
||||
|
||||
struct ggml_context* ctx0 = ggml_init(params);
|
||||
struct ggml_cgraph* gf = ggml_new_graph_custom(ctx0, LORA_GRAPH_SIZE, false);
|
||||
|
||||
struct ggml_cgraph* gf = ggml_new_graph_custom(ctx0, LORA_GRAPH_SIZE, false);
|
||||
std::set<std::string> applied_lora_tensors;
|
||||
for (auto it : model_tensors) {
|
||||
std::string k_tensor = it.first;
|
||||
std::string k_tensor = it.first;
|
||||
struct ggml_tensor* weight = model_tensors[it.first];
|
||||
|
||||
size_t k_pos = k_tensor.find(".weight");
|
||||
if (k_pos == std::string::npos) {
|
||||
continue;
|
||||
}
|
||||
k_tensor = k_tensor.substr(0, k_pos);
|
||||
std::string lora_up_name = "lora." + k_tensor + ".lora_up.weight";
|
||||
std::string lora_down_name = "lora." + k_tensor + ".lora_down.weight";
|
||||
std::string lora_alpha_name = "lora." + k_tensor + ".alpha";
|
||||
if (
|
||||
lora_tensors.find(lora_up_name) != lora_tensors.end() &&
|
||||
lora_tensors.find(lora_down_name) != lora_tensors.end() &&
|
||||
lora_alphas.find(lora_alpha_name) != lora_alphas.end()) {
|
||||
struct ggml_tensor* loraA = lora_tensors[lora_up_name];
|
||||
struct ggml_tensor* loraB = lora_tensors[lora_down_name];
|
||||
struct ggml_tensor* weight = model_tensors[it.first];
|
||||
k_tensor = k_tensor.substr(0, k_pos);
|
||||
replace_all_chars(k_tensor, '.', '_');
|
||||
std::string lora_up_name = "lora." + k_tensor + ".lora_up.weight";
|
||||
std::string lora_down_name = "lora." + k_tensor + ".lora_down.weight";
|
||||
std::string alpha_name = "lora." + k_tensor + ".alpha";
|
||||
std::string scale_name = "lora." + k_tensor + ".scale";
|
||||
|
||||
float scale = strength;
|
||||
scale *= (lora_alphas[lora_alpha_name] / loraB->ne[loraB->n_dims - 1]);
|
||||
ggml_tensor* lora_scale = ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, 1);
|
||||
ggml_tensor* lora_up = NULL;
|
||||
ggml_tensor* lora_down = NULL;
|
||||
|
||||
ggml_allocr_alloc(compute_alloc, lora_scale);
|
||||
if (!ggml_allocr_is_measure(compute_alloc)) {
|
||||
ggml_backend_tensor_set(lora_scale, &scale, 0, ggml_nbytes(lora_scale));
|
||||
}
|
||||
if (lora_tensors.find(lora_up_name) != lora_tensors.end()) {
|
||||
lora_up = lora_tensors[lora_up_name];
|
||||
}
|
||||
|
||||
// flat lora tensors to multiply it
|
||||
int64_t loraA_rows = loraA->ne[loraA->n_dims - 1];
|
||||
loraA = ggml_reshape_2d(ctx0, loraA, ggml_nelements(loraA) / loraA_rows, loraA_rows);
|
||||
int64_t loraB_rows = loraB->ne[loraB->n_dims - 1];
|
||||
loraB = ggml_reshape_2d(ctx0, loraB, ggml_nelements(loraB) / loraB_rows, loraB_rows);
|
||||
if (lora_tensors.find(lora_down_name) != lora_tensors.end()) {
|
||||
lora_down = lora_tensors[lora_down_name];
|
||||
}
|
||||
|
||||
// ggml_mul_mat requires tensor b transposed
|
||||
loraB = ggml_cont(ctx0, ggml_transpose(ctx0, loraB));
|
||||
struct ggml_tensor* loraBA = ggml_mul_mat(ctx0, loraA, loraB);
|
||||
loraBA = ggml_cont(ctx0, ggml_transpose(ctx0, loraBA));
|
||||
loraBA = ggml_reshape(ctx0, loraBA, weight);
|
||||
GGML_ASSERT(ggml_nelements(loraBA) == ggml_nelements(weight));
|
||||
loraBA = ggml_scale_inplace(ctx0, loraBA, lora_scale);
|
||||
ggml_tensor* final_weight;
|
||||
final_weight = ggml_add_inplace(ctx0, weight, loraBA); // apply directly
|
||||
ggml_build_forward_expand(gf, final_weight);
|
||||
if (lora_up == NULL || lora_down == NULL) {
|
||||
continue;
|
||||
}
|
||||
|
||||
applied_lora_tensors.insert(lora_up_name);
|
||||
applied_lora_tensors.insert(lora_down_name);
|
||||
applied_lora_tensors.insert(alpha_name);
|
||||
applied_lora_tensors.insert(scale_name);
|
||||
|
||||
// calc_cale
|
||||
int64_t dim = lora_down->ne[lora_down->n_dims - 1];
|
||||
float scale_value = 1.0f;
|
||||
if (lora_tensors.find(scale_name) != lora_tensors.end()) {
|
||||
scale_value = ggml_backend_tensor_get_f32(lora_tensors[scale_name]);
|
||||
} else if (lora_tensors.find(alpha_name) != lora_tensors.end()) {
|
||||
float alpha = ggml_backend_tensor_get_f32(lora_tensors[alpha_name]);
|
||||
scale_value = alpha / dim;
|
||||
}
|
||||
scale_value *= multiplier;
|
||||
|
||||
ggml_tensor* lora_scale = ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, 1);
|
||||
|
||||
ggml_allocr_alloc(compute_alloc, lora_scale);
|
||||
if (!ggml_allocr_is_measure(compute_alloc)) {
|
||||
ggml_backend_tensor_set(lora_scale, &scale_value, 0, ggml_nbytes(lora_scale));
|
||||
}
|
||||
|
||||
// flat lora tensors to multiply it
|
||||
int64_t lora_up_rows = lora_up->ne[lora_up->n_dims - 1];
|
||||
lora_up = ggml_reshape_2d(ctx0, lora_up, ggml_nelements(lora_up) / lora_up_rows, lora_up_rows);
|
||||
int64_t lora_down_rows = lora_down->ne[lora_down->n_dims - 1];
|
||||
lora_down = ggml_reshape_2d(ctx0, lora_down, ggml_nelements(lora_down) / lora_down_rows, lora_down_rows);
|
||||
|
||||
// ggml_mul_mat requires tensor b transposed
|
||||
lora_down = ggml_cont(ctx0, ggml_transpose(ctx0, lora_down));
|
||||
struct ggml_tensor* updown = ggml_mul_mat(ctx0, lora_up, lora_down);
|
||||
updown = ggml_cont(ctx0, ggml_transpose(ctx0, updown));
|
||||
updown = ggml_reshape(ctx0, updown, weight);
|
||||
GGML_ASSERT(ggml_nelements(updown) == ggml_nelements(weight));
|
||||
updown = ggml_scale_inplace(ctx0, updown, lora_scale);
|
||||
ggml_tensor* final_weight;
|
||||
// if (weight->type != GGML_TYPE_F32 && weight->type != GGML_TYPE_F16) {
|
||||
// final_weight = ggml_new_tensor(ctx0, GGML_TYPE_F32, weight->n_dims, weight->ne);
|
||||
// final_weight = ggml_cpy_inplace(ctx0, weight, final_weight);
|
||||
// final_weight = ggml_add_inplace(ctx0, final_weight, updown);
|
||||
// final_weight = ggml_cpy_inplace(ctx0, final_weight, weight);
|
||||
// } else {
|
||||
// final_weight = ggml_add_inplace(ctx0, weight, updown);
|
||||
// }
|
||||
final_weight = ggml_add_inplace(ctx0, weight, updown); // apply directly
|
||||
ggml_build_forward_expand(gf, final_weight);
|
||||
}
|
||||
|
||||
for (auto& kv : lora_tensors) {
|
||||
if (applied_lora_tensors.find(kv.first) == applied_lora_tensors.end()) {
|
||||
LOG_WARN("unused lora tensor %s", kv.first.c_str());
|
||||
}
|
||||
}
|
||||
|
||||
return gf;
|
||||
}
|
||||
|
||||
@ -3683,11 +3635,7 @@ public:
|
||||
} else if (rng_type == CUDA_RNG) {
|
||||
rng = std::make_shared<PhiloxRNG>();
|
||||
}
|
||||
if (lora_model_dir.size() > 0) {
|
||||
if (lora_model_dir[lora_model_dir.size() - 1] != '/' && lora_model_dir[lora_model_dir.size() - 1] != '\\') {
|
||||
this->lora_model_dir = lora_model_dir + "/";
|
||||
}
|
||||
}
|
||||
this->lora_model_dir = lora_model_dir;
|
||||
}
|
||||
|
||||
~StableDiffusionGGML() {
|
||||
@ -3696,7 +3644,10 @@ public:
|
||||
first_stage_model.destroy();
|
||||
}
|
||||
|
||||
bool load_from_file(const std::string& file_path, Schedule schedule) {
|
||||
bool load_from_file(const std::string& model_path,
|
||||
const std::string& vae_path,
|
||||
ggml_type wtype,
|
||||
Schedule schedule) {
|
||||
#ifdef SD_USE_CUBLAS
|
||||
LOG_DEBUG("Using CUDA backend");
|
||||
backend = ggml_backend_cuda_init();
|
||||
@ -3712,59 +3663,44 @@ public:
|
||||
LOG_INFO("Flash Attention enabled");
|
||||
#endif
|
||||
#endif
|
||||
LOG_INFO("loading model from '%s'", file_path.c_str());
|
||||
ggml_context* ctx_meta = NULL;
|
||||
gguf_context* ctx_gguf = gguf_init_from_file(file_path.c_str(), {true, &ctx_meta});
|
||||
if (!ctx_gguf) {
|
||||
LOG_ERROR("failed to open '%s'", file_path.c_str());
|
||||
LOG_INFO("loading model from '%s'", model_path.c_str());
|
||||
std::shared_ptr<ModelLoader> model_loader = std::shared_ptr<ModelLoader>(init_model_loader_from_file(model_path));
|
||||
|
||||
if (!model_loader) {
|
||||
LOG_ERROR("init model loader from file failed: '%s'", model_path.c_str());
|
||||
return false;
|
||||
}
|
||||
|
||||
FILE* fp = std::fopen(file_path.c_str(), "rb");
|
||||
|
||||
SDVersion version = VERSION_COUNT;
|
||||
|
||||
int n_kv = gguf_get_n_kv(ctx_gguf);
|
||||
int n_tensors = gguf_get_n_tensors(ctx_gguf);
|
||||
|
||||
for (int i = 0; i < n_kv; i++) {
|
||||
const char* name = gguf_get_key(ctx_gguf, i);
|
||||
const enum gguf_type type = gguf_get_kv_type(ctx_gguf, i);
|
||||
LOG_DEBUG("%s: - kv %3d: %42s %-8s", __func__, i, name, gguf_type_name(type));
|
||||
}
|
||||
|
||||
{
|
||||
int nidx = gguf_find_key(ctx_gguf, "sd.model.name");
|
||||
int vidx = gguf_find_key(ctx_gguf, "sd.model.version");
|
||||
if (vidx >= 0 && nidx >= 0) {
|
||||
version = (SDVersion)gguf_get_val_i8(ctx_gguf, vidx);
|
||||
cond_stage_model = FrozenCLIPEmbedderWithCustomWords(version);
|
||||
diffusion_model = UNetModel(version);
|
||||
LOG_INFO("Stable Diffusion %s | %s", model_version_to_str[version], gguf_get_val_str(ctx_gguf, nidx));
|
||||
if (vae_path.size() > 0) {
|
||||
LOG_INFO("loading vae from '%s'", vae_path.c_str());
|
||||
if (!model_loader->init_from_file(vae_path, "vae.")) {
|
||||
LOG_WARN("loading vae from '%s' failed", vae_path.c_str());
|
||||
}
|
||||
}
|
||||
|
||||
{
|
||||
int idx = gguf_find_key(ctx_gguf, "sd.model.dtype");
|
||||
if (idx >= 0) {
|
||||
model_data_type = (ggml_type)gguf_get_val_i32(ctx_gguf, idx);
|
||||
LOG_INFO("model data type: %s", ggml_type_name(model_data_type));
|
||||
}
|
||||
SDVersion version = model_loader->get_sd_version();
|
||||
if (version == VERSION_COUNT) {
|
||||
LOG_ERROR("get sd version from file failed: '%s'", model_path.c_str());
|
||||
return false;
|
||||
}
|
||||
cond_stage_model = FrozenCLIPEmbedderWithCustomWords(version);
|
||||
diffusion_model = UNetModel(version);
|
||||
LOG_INFO("Stable Diffusion %s ", model_version_to_str[version]);
|
||||
if (wtype == GGML_TYPE_COUNT) {
|
||||
model_data_type = model_loader->get_sd_wtype();
|
||||
} else {
|
||||
model_data_type = wtype;
|
||||
}
|
||||
LOG_INFO("Stable Diffusion weight type: %s", ggml_type_name(model_data_type));
|
||||
|
||||
LOG_DEBUG("loading vocab");
|
||||
|
||||
// load vocab
|
||||
{
|
||||
int tidx = gguf_find_key(ctx_gguf, "sd.vocab.tokens");
|
||||
if (tidx == -1) {
|
||||
LOG_ERROR("vocab not found");
|
||||
return false;
|
||||
}
|
||||
int n_vocab = gguf_get_arr_n(ctx_gguf, tidx);
|
||||
for (int i = 0; i < n_vocab; i++) {
|
||||
cond_stage_model.tokenizer.add_token(gguf_get_arr_str(ctx_gguf, tidx, i), i);
|
||||
}
|
||||
auto add_token = [&](const std::string& token, int32_t token_id) {
|
||||
cond_stage_model.tokenizer.add_token(token, token_id);
|
||||
};
|
||||
bool success = model_loader->load_vocab(add_token);
|
||||
if (!success) {
|
||||
LOG_ERROR("get vocab from file failed: '%s'", model_path.c_str());
|
||||
return false;
|
||||
}
|
||||
|
||||
// create the ggml context for network params
|
||||
@ -3793,34 +3729,33 @@ public:
|
||||
first_stage_model.map_by_name(tensors, "first_stage_model.");
|
||||
}
|
||||
|
||||
std::set<std::string> tensor_names_in_file;
|
||||
int64_t t0 = ggml_time_ms();
|
||||
LOG_DEBUG("loading weights");
|
||||
struct ggml_init_params params;
|
||||
params.mem_size = static_cast<size_t>(10 * 1024) * 1024; // 10M
|
||||
params.mem_buffer = NULL;
|
||||
params.no_alloc = false;
|
||||
struct ggml_context* ctx = ggml_init(params); // for alphas_cumprod and is_using_v_parameterization check
|
||||
if (!ctx) {
|
||||
LOG_ERROR("ggml_init() failed");
|
||||
return false;
|
||||
}
|
||||
ggml_tensor* alphas_cumprod_tensor = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, TIMESTEPS);
|
||||
calculate_alphas_cumprod((float*)alphas_cumprod_tensor->data);
|
||||
|
||||
// load weights
|
||||
float alphas_cumprod[TIMESTEPS];
|
||||
LOG_DEBUG("loading weights");
|
||||
std::set<std::string> tensor_names_in_file;
|
||||
int64_t t0 = ggml_time_ms();
|
||||
|
||||
size_t total_size = 0;
|
||||
std::vector<char> read_buf;
|
||||
size_t total_size = 0;
|
||||
size_t data_offset = gguf_get_data_offset(ctx_gguf);
|
||||
for (int i = 0; i < n_tensors; i++) {
|
||||
std::string name = gguf_get_tensor_name(ctx_gguf, i);
|
||||
struct ggml_tensor* dummy = ggml_get_tensor(ctx_meta, name.c_str());
|
||||
size_t offset = data_offset + gguf_get_tensor_offset(ctx_gguf, i);
|
||||
|
||||
auto on_new_tensor_cb = [&](const TensorStorage& tensor_storage, ggml_tensor** dst_tensor) -> bool {
|
||||
const std::string& name = tensor_storage.name;
|
||||
tensor_names_in_file.insert(name);
|
||||
|
||||
#ifdef _WIN32
|
||||
int ret = _fseeki64(fp, (__int64)offset, SEEK_SET);
|
||||
#else
|
||||
int ret = std::fseek(fp, (long)offset, SEEK_SET);
|
||||
#endif
|
||||
if (ret == -1) {
|
||||
return false;
|
||||
}
|
||||
|
||||
if (name == "alphas_cumprod") {
|
||||
sd_fread(alphas_cumprod, 1, ggml_nbytes(dummy), fp);
|
||||
continue;
|
||||
*dst_tensor = alphas_cumprod_tensor;
|
||||
return true;
|
||||
}
|
||||
|
||||
struct ggml_tensor* real;
|
||||
@ -3832,54 +3767,46 @@ public:
|
||||
} else {
|
||||
if (!vae_decode_only) {
|
||||
LOG_WARN("unknown tensor '%s' in model file", name.data());
|
||||
return false;
|
||||
}
|
||||
}
|
||||
continue;
|
||||
return true;
|
||||
}
|
||||
|
||||
if (
|
||||
real->ne[0] != dummy->ne[0] ||
|
||||
real->ne[1] != dummy->ne[1] ||
|
||||
real->ne[2] != dummy->ne[2] ||
|
||||
real->ne[3] != dummy->ne[3]) {
|
||||
real->ne[0] != tensor_storage.ne[0] ||
|
||||
real->ne[1] != tensor_storage.ne[1] ||
|
||||
real->ne[2] != tensor_storage.ne[2] ||
|
||||
real->ne[3] != tensor_storage.ne[3]) {
|
||||
LOG_ERROR(
|
||||
"tensor '%s' has wrong shape in model file: "
|
||||
"got [%d, %d, %d, %d], expected [%d, %d, %d, %d]",
|
||||
name.c_str(),
|
||||
(int)dummy->ne[0], (int)dummy->ne[1], (int)dummy->ne[2], (int)dummy->ne[3],
|
||||
(int)tensor_storage.ne[0], (int)tensor_storage.ne[1], (int)tensor_storage.ne[2], (int)tensor_storage.ne[3],
|
||||
(int)real->ne[0], (int)real->ne[1], (int)real->ne[2], (int)real->ne[3]);
|
||||
return false;
|
||||
}
|
||||
|
||||
if (real->type != dummy->type) {
|
||||
LOG_ERROR("tensor '%s' has wrong type in model file: got %s, expect %s",
|
||||
name.c_str(), ggml_type_name(dummy->type), ggml_type_name(real->type));
|
||||
return false;
|
||||
}
|
||||
*dst_tensor = real;
|
||||
|
||||
int num_bytes = (int)ggml_nbytes(dummy);
|
||||
total_size += ggml_nbytes(real);
|
||||
return true;
|
||||
};
|
||||
|
||||
if (ggml_backend_is_cpu(backend)) {
|
||||
// for the CPU and Metal backend, we can read directly into the tensor
|
||||
sd_fread(real->data, 1, num_bytes, fp);
|
||||
} else {
|
||||
// read into a temporary buffer first, then copy to device memory
|
||||
read_buf.resize(num_bytes);
|
||||
sd_fread(read_buf.data(), 1, num_bytes, fp);
|
||||
ggml_backend_tensor_set(real, read_buf.data(), 0, num_bytes);
|
||||
}
|
||||
// print_ggml_tensor(alphas_cumprod_tensor);
|
||||
|
||||
total_size += ggml_nbytes(dummy);
|
||||
success = model_loader->load_tensors(on_new_tensor_cb);
|
||||
if (!success) {
|
||||
LOG_ERROR("load tensors from file failed");
|
||||
ggml_free(ctx);
|
||||
return false;
|
||||
}
|
||||
|
||||
gguf_free(ctx_gguf);
|
||||
ggml_free(ctx_meta);
|
||||
// print_ggml_tensor(alphas_cumprod_tensor);
|
||||
|
||||
std::fclose(fp);
|
||||
read_buf.clear();
|
||||
// calculate_alphas_cumprod((float*)alphas_cumprod_tensor->data);
|
||||
|
||||
bool some_tensor_not_init = false;
|
||||
|
||||
for (auto pair : tensors) {
|
||||
if (pair.first.find("cond_stage_model.transformer.text_model.encoder.layers.23") != std::string::npos) {
|
||||
continue;
|
||||
@ -3891,12 +3818,8 @@ public:
|
||||
}
|
||||
}
|
||||
|
||||
if (tensor_names_in_file.find("alphas_cumprod") == tensor_names_in_file.end()) {
|
||||
LOG_ERROR("tensor alphas_cumprod not in model file");
|
||||
some_tensor_not_init = true;
|
||||
}
|
||||
|
||||
if (some_tensor_not_init) {
|
||||
ggml_free(ctx);
|
||||
return false;
|
||||
}
|
||||
|
||||
@ -3912,24 +3835,14 @@ public:
|
||||
diffusion_model.memory_buffer_size / 1024.0 / 1024.0,
|
||||
first_stage_model.memory_buffer_size / 1024.0 / 1024.0);
|
||||
int64_t t1 = ggml_time_ms();
|
||||
LOG_INFO("loading model from '%s' completed, taking %.2fs", file_path.c_str(), (t1 - t0) * 1.0f / 1000);
|
||||
LOG_INFO("loading model from '%s' completed, taking %.2fs", model_path.c_str(), (t1 - t0) * 1.0f / 1000);
|
||||
|
||||
// check is_using_v_parameterization_for_sd2
|
||||
bool is_using_v_parameterization = false;
|
||||
if (version == VERSION_2_x) {
|
||||
struct ggml_init_params params;
|
||||
params.mem_size = static_cast<size_t>(10 * 1024) * 1024; // 10M
|
||||
params.mem_buffer = NULL;
|
||||
params.no_alloc = false;
|
||||
struct ggml_context* ctx = ggml_init(params);
|
||||
if (!ctx) {
|
||||
LOG_ERROR("ggml_init() failed");
|
||||
return false;
|
||||
}
|
||||
if (is_using_v_parameterization_for_sd2(ctx)) {
|
||||
is_using_v_parameterization = true;
|
||||
}
|
||||
ggml_free(ctx);
|
||||
}
|
||||
|
||||
if (is_using_v_parameterization) {
|
||||
@ -3959,11 +3872,12 @@ public:
|
||||
}
|
||||
|
||||
for (int i = 0; i < TIMESTEPS; i++) {
|
||||
denoiser->schedule->alphas_cumprod[i] = alphas_cumprod[i];
|
||||
denoiser->schedule->alphas_cumprod[i] = ((float*)alphas_cumprod_tensor->data)[i];
|
||||
denoiser->schedule->sigmas[i] = std::sqrt((1 - denoiser->schedule->alphas_cumprod[i]) / denoiser->schedule->alphas_cumprod[i]);
|
||||
denoiser->schedule->log_sigmas[i] = std::log(denoiser->schedule->sigmas[i]);
|
||||
}
|
||||
LOG_DEBUG("finished loaded file");
|
||||
ggml_free(ctx);
|
||||
return true;
|
||||
}
|
||||
|
||||
@ -4005,13 +3919,26 @@ public:
|
||||
void apply_lora(const std::string& lora_name, float multiplier) {
|
||||
int64_t t0 = ggml_time_ms();
|
||||
LoraModel lora;
|
||||
std::string file_path = lora_model_dir + lora_name + ".gguf";
|
||||
if (lora.load(backend, file_path)) {
|
||||
lora.strength = multiplier;
|
||||
lora.apply(tensors, n_threads);
|
||||
loras[lora_name] = lora;
|
||||
lora.release();
|
||||
std::string st_file_path = path_join(lora_model_dir, lora_name + ".safetensors");
|
||||
std::string ckpt_file_path = path_join(lora_model_dir, lora_name + ".ckpt");
|
||||
std::string file_path;
|
||||
if (file_exists(st_file_path)) {
|
||||
file_path = st_file_path;
|
||||
} else if (file_exists(ckpt_file_path)) {
|
||||
file_path = ckpt_file_path;
|
||||
} else {
|
||||
LOG_WARN("can not find %s or %s for lora %s", st_file_path.c_str(), ckpt_file_path.c_str(), lora_name.c_str());
|
||||
return;
|
||||
}
|
||||
if (!lora.load(backend, file_path)) {
|
||||
LOG_WARN("load lora tensors from %s failed", file_path.c_str());
|
||||
return;
|
||||
}
|
||||
|
||||
lora.multiplier = multiplier;
|
||||
lora.apply(tensors, n_threads);
|
||||
loras[lora_name] = lora;
|
||||
lora.release();
|
||||
|
||||
int64_t t1 = ggml_time_ms();
|
||||
|
||||
@ -4621,8 +4548,11 @@ StableDiffusion::StableDiffusion(int n_threads,
|
||||
rng_type);
|
||||
}
|
||||
|
||||
bool StableDiffusion::load_from_file(const std::string& file_path, Schedule s) {
|
||||
return sd->load_from_file(file_path, s);
|
||||
bool StableDiffusion::load_from_file(const std::string& model_path,
|
||||
const std::string& vae_path,
|
||||
ggml_type wtype,
|
||||
Schedule s) {
|
||||
return sd->load_from_file(model_path, vae_path, wtype, s);
|
||||
}
|
||||
|
||||
std::vector<uint8_t*> StableDiffusion::txt2img(std::string prompt,
|
||||
|
@ -5,13 +5,6 @@
|
||||
#include <memory>
|
||||
#include <vector>
|
||||
|
||||
enum SDLogLevel {
|
||||
DEBUG,
|
||||
INFO,
|
||||
WARN,
|
||||
ERROR
|
||||
};
|
||||
|
||||
enum RNGType {
|
||||
STD_DEFAULT_RNG,
|
||||
CUDA_RNG
|
||||
@ -48,7 +41,10 @@ public:
|
||||
bool free_params_immediately = false,
|
||||
std::string lora_model_dir = "",
|
||||
RNGType rng_type = STD_DEFAULT_RNG);
|
||||
bool load_from_file(const std::string& file_path, Schedule d = DEFAULT);
|
||||
bool load_from_file(const std::string& model_path,
|
||||
const std::string& vae_path,
|
||||
ggml_type wtype,
|
||||
Schedule d = DEFAULT);
|
||||
std::vector<uint8_t*> txt2img(
|
||||
std::string prompt,
|
||||
std::string negative_prompt,
|
||||
@ -73,7 +69,6 @@ public:
|
||||
int64_t seed);
|
||||
};
|
||||
|
||||
void set_sd_log_level(SDLogLevel level);
|
||||
std::string sd_get_system_info();
|
||||
|
||||
#endif // __STABLE_DIFFUSION_H__
|
3
thirdparty/CMakeLists.txt
vendored
Normal file
3
thirdparty/CMakeLists.txt
vendored
Normal file
@ -0,0 +1,3 @@
|
||||
set(Z_TARGET zip)
|
||||
add_library(${Z_TARGET} OBJECT zip.c zip.h miniz.h)
|
||||
target_include_directories(${Z_TARGET} PUBLIC .)
|
2
thirdparty/README.md
vendored
Normal file
2
thirdparty/README.md
vendored
Normal file
@ -0,0 +1,2 @@
|
||||
- json.hpp library from: https://github.com/nlohmann/json
|
||||
- ZIP Library from: https://github.com/kuba--/zip
|
24596
thirdparty/json.hpp
vendored
Normal file
24596
thirdparty/json.hpp
vendored
Normal file
File diff suppressed because it is too large
Load Diff
10130
thirdparty/miniz.h
vendored
Normal file
10130
thirdparty/miniz.h
vendored
Normal file
File diff suppressed because it is too large
Load Diff
7987
thirdparty/stb_image.h
vendored
Normal file
7987
thirdparty/stb_image.h
vendored
Normal file
File diff suppressed because it is too large
Load Diff
1741
thirdparty/stb_image_write.h
vendored
Normal file
1741
thirdparty/stb_image_write.h
vendored
Normal file
File diff suppressed because it is too large
Load Diff
1836
thirdparty/zip.c
vendored
Normal file
1836
thirdparty/zip.c
vendored
Normal file
File diff suppressed because it is too large
Load Diff
509
thirdparty/zip.h
vendored
Normal file
509
thirdparty/zip.h
vendored
Normal file
@ -0,0 +1,509 @@
|
||||
/*
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
||||
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
|
||||
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
|
||||
* IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR
|
||||
* OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
|
||||
* ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
|
||||
* OTHER DEALINGS IN THE SOFTWARE.
|
||||
*/
|
||||
|
||||
#pragma once
|
||||
#ifndef ZIP_H
|
||||
#define ZIP_H
|
||||
|
||||
#include <stdint.h>
|
||||
#include <string.h>
|
||||
#include <sys/types.h>
|
||||
|
||||
#ifndef ZIP_SHARED
|
||||
#define ZIP_EXPORT
|
||||
#else
|
||||
#ifdef _WIN32
|
||||
#ifdef ZIP_BUILD_SHARED
|
||||
#define ZIP_EXPORT __declspec(dllexport)
|
||||
#else
|
||||
#define ZIP_EXPORT __declspec(dllimport)
|
||||
#endif
|
||||
#else
|
||||
#define ZIP_EXPORT __attribute__((visibility("default")))
|
||||
#endif
|
||||
#endif
|
||||
|
||||
#ifdef __cplusplus
|
||||
extern "C" {
|
||||
#endif
|
||||
|
||||
#if !defined(_POSIX_C_SOURCE) && defined(_MSC_VER)
|
||||
// 64-bit Windows is the only mainstream platform
|
||||
// where sizeof(long) != sizeof(void*)
|
||||
#ifdef _WIN64
|
||||
typedef long long ssize_t; /* byte count or error */
|
||||
#else
|
||||
typedef long ssize_t; /* byte count or error */
|
||||
#endif
|
||||
#endif
|
||||
|
||||
/**
|
||||
* @mainpage
|
||||
*
|
||||
* Documentation for @ref zip.
|
||||
*/
|
||||
|
||||
/**
|
||||
* @addtogroup zip
|
||||
* @{
|
||||
*/
|
||||
|
||||
/**
|
||||
* Default zip compression level.
|
||||
*/
|
||||
#define ZIP_DEFAULT_COMPRESSION_LEVEL 6
|
||||
|
||||
/**
|
||||
* Error codes
|
||||
*/
|
||||
#define ZIP_ENOINIT -1 // not initialized
|
||||
#define ZIP_EINVENTNAME -2 // invalid entry name
|
||||
#define ZIP_ENOENT -3 // entry not found
|
||||
#define ZIP_EINVMODE -4 // invalid zip mode
|
||||
#define ZIP_EINVLVL -5 // invalid compression level
|
||||
#define ZIP_ENOSUP64 -6 // no zip 64 support
|
||||
#define ZIP_EMEMSET -7 // memset error
|
||||
#define ZIP_EWRTENT -8 // cannot write data to entry
|
||||
#define ZIP_ETDEFLINIT -9 // cannot initialize tdefl compressor
|
||||
#define ZIP_EINVIDX -10 // invalid index
|
||||
#define ZIP_ENOHDR -11 // header not found
|
||||
#define ZIP_ETDEFLBUF -12 // cannot flush tdefl buffer
|
||||
#define ZIP_ECRTHDR -13 // cannot create entry header
|
||||
#define ZIP_EWRTHDR -14 // cannot write entry header
|
||||
#define ZIP_EWRTDIR -15 // cannot write to central dir
|
||||
#define ZIP_EOPNFILE -16 // cannot open file
|
||||
#define ZIP_EINVENTTYPE -17 // invalid entry type
|
||||
#define ZIP_EMEMNOALLOC -18 // extracting data using no memory allocation
|
||||
#define ZIP_ENOFILE -19 // file not found
|
||||
#define ZIP_ENOPERM -20 // no permission
|
||||
#define ZIP_EOOMEM -21 // out of memory
|
||||
#define ZIP_EINVZIPNAME -22 // invalid zip archive name
|
||||
#define ZIP_EMKDIR -23 // make dir error
|
||||
#define ZIP_ESYMLINK -24 // symlink error
|
||||
#define ZIP_ECLSZIP -25 // close archive error
|
||||
#define ZIP_ECAPSIZE -26 // capacity size too small
|
||||
#define ZIP_EFSEEK -27 // fseek error
|
||||
#define ZIP_EFREAD -28 // fread error
|
||||
#define ZIP_EFWRITE -29 // fwrite error
|
||||
#define ZIP_ERINIT -30 // cannot initialize reader
|
||||
#define ZIP_EWINIT -31 // cannot initialize writer
|
||||
#define ZIP_EWRINIT -32 // cannot initialize writer from reader
|
||||
|
||||
/**
|
||||
* Looks up the error message string corresponding to an error number.
|
||||
* @param errnum error number
|
||||
* @return error message string corresponding to errnum or NULL if error is not
|
||||
* found.
|
||||
*/
|
||||
extern ZIP_EXPORT const char *zip_strerror(int errnum);
|
||||
|
||||
/**
|
||||
* @struct zip_t
|
||||
*
|
||||
* This data structure is used throughout the library to represent zip archive -
|
||||
* forward declaration.
|
||||
*/
|
||||
struct zip_t;
|
||||
|
||||
/**
|
||||
* Opens zip archive with compression level using the given mode.
|
||||
*
|
||||
* @param zipname zip archive file name.
|
||||
* @param level compression level (0-9 are the standard zlib-style levels).
|
||||
* @param mode file access mode.
|
||||
* - 'r': opens a file for reading/extracting (the file must exists).
|
||||
* - 'w': creates an empty file for writing.
|
||||
* - 'a': appends to an existing archive.
|
||||
*
|
||||
* @return the zip archive handler or NULL on error
|
||||
*/
|
||||
extern ZIP_EXPORT struct zip_t *zip_open(const char *zipname, int level,
|
||||
char mode);
|
||||
|
||||
/**
|
||||
* Opens zip archive with compression level using the given mode.
|
||||
* The function additionally returns @param errnum -
|
||||
*
|
||||
* @param zipname zip archive file name.
|
||||
* @param level compression level (0-9 are the standard zlib-style levels).
|
||||
* @param mode file access mode.
|
||||
* - 'r': opens a file for reading/extracting (the file must exists).
|
||||
* - 'w': creates an empty file for writing.
|
||||
* - 'a': appends to an existing archive.
|
||||
* @param errnum 0 on success, negative number (< 0) on error.
|
||||
*
|
||||
* @return the zip archive handler or NULL on error
|
||||
*/
|
||||
extern ZIP_EXPORT struct zip_t *
|
||||
zip_openwitherror(const char *zipname, int level, char mode, int *errnum);
|
||||
|
||||
/**
|
||||
* Closes the zip archive, releases resources - always finalize.
|
||||
*
|
||||
* @param zip zip archive handler.
|
||||
*/
|
||||
extern ZIP_EXPORT void zip_close(struct zip_t *zip);
|
||||
|
||||
/**
|
||||
* Determines if the archive has a zip64 end of central directory headers.
|
||||
*
|
||||
* @param zip zip archive handler.
|
||||
*
|
||||
* @return the return code - 1 (true), 0 (false), negative number (< 0) on
|
||||
* error.
|
||||
*/
|
||||
extern ZIP_EXPORT int zip_is64(struct zip_t *zip);
|
||||
|
||||
/**
|
||||
* Opens an entry by name in the zip archive.
|
||||
*
|
||||
* For zip archive opened in 'w' or 'a' mode the function will append
|
||||
* a new entry. In readonly mode the function tries to locate the entry
|
||||
* in global dictionary.
|
||||
*
|
||||
* @param zip zip archive handler.
|
||||
* @param entryname an entry name in local dictionary.
|
||||
*
|
||||
* @return the return code - 0 on success, negative number (< 0) on error.
|
||||
*/
|
||||
extern ZIP_EXPORT int zip_entry_open(struct zip_t *zip, const char *entryname);
|
||||
|
||||
/**
|
||||
* Opens an entry by name in the zip archive.
|
||||
*
|
||||
* For zip archive opened in 'w' or 'a' mode the function will append
|
||||
* a new entry. In readonly mode the function tries to locate the entry
|
||||
* in global dictionary (case sensitive).
|
||||
*
|
||||
* @param zip zip archive handler.
|
||||
* @param entryname an entry name in local dictionary (case sensitive).
|
||||
*
|
||||
* @return the return code - 0 on success, negative number (< 0) on error.
|
||||
*/
|
||||
extern ZIP_EXPORT int zip_entry_opencasesensitive(struct zip_t *zip,
|
||||
const char *entryname);
|
||||
|
||||
/**
|
||||
* Opens a new entry by index in the zip archive.
|
||||
*
|
||||
* This function is only valid if zip archive was opened in 'r' (readonly) mode.
|
||||
*
|
||||
* @param zip zip archive handler.
|
||||
* @param index index in local dictionary.
|
||||
*
|
||||
* @return the return code - 0 on success, negative number (< 0) on error.
|
||||
*/
|
||||
extern ZIP_EXPORT int zip_entry_openbyindex(struct zip_t *zip, size_t index);
|
||||
|
||||
/**
|
||||
* Closes a zip entry, flushes buffer and releases resources.
|
||||
*
|
||||
* @param zip zip archive handler.
|
||||
*
|
||||
* @return the return code - 0 on success, negative number (< 0) on error.
|
||||
*/
|
||||
extern ZIP_EXPORT int zip_entry_close(struct zip_t *zip);
|
||||
|
||||
/**
|
||||
* Returns a local name of the current zip entry.
|
||||
*
|
||||
* The main difference between user's entry name and local entry name
|
||||
* is optional relative path.
|
||||
* Following .ZIP File Format Specification - the path stored MUST not contain
|
||||
* a drive or device letter, or a leading slash.
|
||||
* All slashes MUST be forward slashes '/' as opposed to backwards slashes '\'
|
||||
* for compatibility with Amiga and UNIX file systems etc.
|
||||
*
|
||||
* @param zip: zip archive handler.
|
||||
*
|
||||
* @return the pointer to the current zip entry name, or NULL on error.
|
||||
*/
|
||||
extern ZIP_EXPORT const char *zip_entry_name(struct zip_t *zip);
|
||||
|
||||
/**
|
||||
* Returns an index of the current zip entry.
|
||||
*
|
||||
* @param zip zip archive handler.
|
||||
*
|
||||
* @return the index on success, negative number (< 0) on error.
|
||||
*/
|
||||
extern ZIP_EXPORT ssize_t zip_entry_index(struct zip_t *zip);
|
||||
|
||||
/**
|
||||
* Determines if the current zip entry is a directory entry.
|
||||
*
|
||||
* @param zip zip archive handler.
|
||||
*
|
||||
* @return the return code - 1 (true), 0 (false), negative number (< 0) on
|
||||
* error.
|
||||
*/
|
||||
extern ZIP_EXPORT int zip_entry_isdir(struct zip_t *zip);
|
||||
|
||||
/**
|
||||
* Returns the uncompressed size of the current zip entry.
|
||||
* Alias for zip_entry_uncomp_size (for backward compatibility).
|
||||
*
|
||||
* @param zip zip archive handler.
|
||||
*
|
||||
* @return the uncompressed size in bytes.
|
||||
*/
|
||||
extern ZIP_EXPORT unsigned long long zip_entry_size(struct zip_t *zip);
|
||||
|
||||
/**
|
||||
* Returns the uncompressed size of the current zip entry.
|
||||
*
|
||||
* @param zip zip archive handler.
|
||||
*
|
||||
* @return the uncompressed size in bytes.
|
||||
*/
|
||||
extern ZIP_EXPORT unsigned long long zip_entry_uncomp_size(struct zip_t *zip);
|
||||
|
||||
/**
|
||||
* Returns the compressed size of the current zip entry.
|
||||
*
|
||||
* @param zip zip archive handler.
|
||||
*
|
||||
* @return the compressed size in bytes.
|
||||
*/
|
||||
extern ZIP_EXPORT unsigned long long zip_entry_comp_size(struct zip_t *zip);
|
||||
|
||||
/**
|
||||
* Returns CRC-32 checksum of the current zip entry.
|
||||
*
|
||||
* @param zip zip archive handler.
|
||||
*
|
||||
* @return the CRC-32 checksum.
|
||||
*/
|
||||
extern ZIP_EXPORT unsigned int zip_entry_crc32(struct zip_t *zip);
|
||||
|
||||
/**
|
||||
* Compresses an input buffer for the current zip entry.
|
||||
*
|
||||
* @param zip zip archive handler.
|
||||
* @param buf input buffer.
|
||||
* @param bufsize input buffer size (in bytes).
|
||||
*
|
||||
* @return the return code - 0 on success, negative number (< 0) on error.
|
||||
*/
|
||||
extern ZIP_EXPORT int zip_entry_write(struct zip_t *zip, const void *buf,
|
||||
size_t bufsize);
|
||||
|
||||
/**
|
||||
* Compresses a file for the current zip entry.
|
||||
*
|
||||
* @param zip zip archive handler.
|
||||
* @param filename input file.
|
||||
*
|
||||
* @return the return code - 0 on success, negative number (< 0) on error.
|
||||
*/
|
||||
extern ZIP_EXPORT int zip_entry_fwrite(struct zip_t *zip, const char *filename);
|
||||
|
||||
/**
|
||||
* Extracts the current zip entry into output buffer.
|
||||
*
|
||||
* The function allocates sufficient memory for a output buffer.
|
||||
*
|
||||
* @param zip zip archive handler.
|
||||
* @param buf output buffer.
|
||||
* @param bufsize output buffer size (in bytes).
|
||||
*
|
||||
* @note remember to release memory allocated for a output buffer.
|
||||
* for large entries, please take a look at zip_entry_extract function.
|
||||
*
|
||||
* @return the return code - the number of bytes actually read on success.
|
||||
* Otherwise a negative number (< 0) on error.
|
||||
*/
|
||||
extern ZIP_EXPORT ssize_t zip_entry_read(struct zip_t *zip, void **buf,
|
||||
size_t *bufsize);
|
||||
|
||||
/**
|
||||
* Extracts the current zip entry into a memory buffer using no memory
|
||||
* allocation.
|
||||
*
|
||||
* @param zip zip archive handler.
|
||||
* @param buf preallocated output buffer.
|
||||
* @param bufsize output buffer size (in bytes).
|
||||
*
|
||||
* @note ensure supplied output buffer is large enough.
|
||||
* zip_entry_size function (returns uncompressed size for the current
|
||||
* entry) can be handy to estimate how big buffer is needed.
|
||||
* For large entries, please take a look at zip_entry_extract function.
|
||||
*
|
||||
* @return the return code - the number of bytes actually read on success.
|
||||
* Otherwise a negative number (< 0) on error (e.g. bufsize is not large
|
||||
* enough).
|
||||
*/
|
||||
extern ZIP_EXPORT ssize_t zip_entry_noallocread(struct zip_t *zip, void *buf,
|
||||
size_t bufsize);
|
||||
|
||||
/**
|
||||
* Extracts the current zip entry into output file.
|
||||
*
|
||||
* @param zip zip archive handler.
|
||||
* @param filename output file.
|
||||
*
|
||||
* @return the return code - 0 on success, negative number (< 0) on error.
|
||||
*/
|
||||
extern ZIP_EXPORT int zip_entry_fread(struct zip_t *zip, const char *filename);
|
||||
|
||||
/**
|
||||
* Extracts the current zip entry using a callback function (on_extract).
|
||||
*
|
||||
* @param zip zip archive handler.
|
||||
* @param on_extract callback function.
|
||||
* @param arg opaque pointer (optional argument, which you can pass to the
|
||||
* on_extract callback)
|
||||
*
|
||||
* @return the return code - 0 on success, negative number (< 0) on error.
|
||||
*/
|
||||
extern ZIP_EXPORT int
|
||||
zip_entry_extract(struct zip_t *zip,
|
||||
size_t (*on_extract)(void *arg, uint64_t offset,
|
||||
const void *data, size_t size),
|
||||
void *arg);
|
||||
|
||||
/**
|
||||
* Returns the number of all entries (files and directories) in the zip archive.
|
||||
*
|
||||
* @param zip zip archive handler.
|
||||
*
|
||||
* @return the return code - the number of entries on success, negative number
|
||||
* (< 0) on error.
|
||||
*/
|
||||
extern ZIP_EXPORT ssize_t zip_entries_total(struct zip_t *zip);
|
||||
|
||||
/**
|
||||
* Deletes zip archive entries.
|
||||
*
|
||||
* @param zip zip archive handler.
|
||||
* @param entries array of zip archive entries to be deleted.
|
||||
* @param len the number of entries to be deleted.
|
||||
* @return the number of deleted entries, or negative number (< 0) on error.
|
||||
*/
|
||||
extern ZIP_EXPORT ssize_t zip_entries_delete(struct zip_t *zip,
|
||||
char *const entries[], size_t len);
|
||||
|
||||
/**
|
||||
* Extracts a zip archive stream into directory.
|
||||
*
|
||||
* If on_extract is not NULL, the callback will be called after
|
||||
* successfully extracted each zip entry.
|
||||
* Returning a negative value from the callback will cause abort and return an
|
||||
* error. The last argument (void *arg) is optional, which you can use to pass
|
||||
* data to the on_extract callback.
|
||||
*
|
||||
* @param stream zip archive stream.
|
||||
* @param size stream size.
|
||||
* @param dir output directory.
|
||||
* @param on_extract on extract callback.
|
||||
* @param arg opaque pointer.
|
||||
*
|
||||
* @return the return code - 0 on success, negative number (< 0) on error.
|
||||
*/
|
||||
extern ZIP_EXPORT int
|
||||
zip_stream_extract(const char *stream, size_t size, const char *dir,
|
||||
int (*on_extract)(const char *filename, void *arg),
|
||||
void *arg);
|
||||
|
||||
/**
|
||||
* Opens zip archive stream into memory.
|
||||
*
|
||||
* @param stream zip archive stream.
|
||||
* @param size stream size.
|
||||
* @param level compression level (0-9 are the standard zlib-style levels).
|
||||
* @param mode file access mode.
|
||||
 * - 'r': opens a file for reading/extracting (the file must exist).
|
||||
* - 'w': creates an empty file for writing.
|
||||
* - 'a': appends to an existing archive.
|
||||
*
|
||||
* @return the zip archive handler or NULL on error
|
||||
*/
|
||||
extern ZIP_EXPORT struct zip_t *zip_stream_open(const char *stream, size_t size,
|
||||
int level, char mode);
|
||||
|
||||
/**
|
||||
* Opens zip archive stream into memory.
|
||||
 * The function additionally reports the failure reason via the errnum output argument.
|
||||
*
|
||||
* @param stream zip archive stream.
|
||||
 * @param size stream size.
|
||||
* @param level compression level (0-9 are the standard zlib-style levels).
|
||||
* @param mode file access mode.
|
||||
 * - 'r': opens a file for reading/extracting (the file must exist).
|
||||
* - 'w': creates an empty file for writing.
|
||||
* - 'a': appends to an existing archive.
|
||||
* @param errnum 0 on success, negative number (< 0) on error.
|
||||
*
|
||||
* @return the zip archive handler or NULL on error
|
||||
*/
|
||||
extern ZIP_EXPORT struct zip_t *zip_stream_openwitherror(const char *stream,
|
||||
size_t size, int level,
|
||||
char mode,
|
||||
int *errnum);
|
||||
|
||||
/**
|
||||
* Copy zip archive stream output buffer.
|
||||
*
|
||||
* @param zip zip archive handler.
|
||||
* @param buf output buffer. User should free buf.
|
||||
* @param bufsize output buffer size (in bytes).
|
||||
*
|
||||
* @return copy size
|
||||
*/
|
||||
extern ZIP_EXPORT ssize_t zip_stream_copy(struct zip_t *zip, void **buf,
|
||||
size_t *bufsize);
|
||||
|
||||
/**
|
||||
* Close zip archive releases resources.
|
||||
*
|
||||
* @param zip zip archive handler.
|
||||
*
|
||||
 * @return none (all resources held by the archive handler are released).
|
||||
*/
|
||||
extern ZIP_EXPORT void zip_stream_close(struct zip_t *zip);
|
||||
|
||||
/**
|
||||
* Creates a new archive and puts files into a single zip archive.
|
||||
*
|
||||
* @param zipname zip archive file.
|
||||
* @param filenames input files.
|
||||
 * @param len number of input files.
|
||||
*
|
||||
* @return the return code - 0 on success, negative number (< 0) on error.
|
||||
*/
|
||||
extern ZIP_EXPORT int zip_create(const char *zipname, const char *filenames[],
|
||||
size_t len);
|
||||
|
||||
/**
|
||||
* Extracts a zip archive file into directory.
|
||||
*
|
||||
* If on_extract_entry is not NULL, the callback will be called after
|
||||
* successfully extracted each zip entry.
|
||||
* Returning a negative value from the callback will cause abort and return an
|
||||
* error. The last argument (void *arg) is optional, which you can use to pass
|
||||
* data to the on_extract_entry callback.
|
||||
*
|
||||
* @param zipname zip archive file.
|
||||
* @param dir output directory.
|
||||
* @param on_extract_entry on extract callback.
|
||||
* @param arg opaque pointer.
|
||||
*
|
||||
* @return the return code - 0 on success, negative number (< 0) on error.
|
||||
*/
|
||||
extern ZIP_EXPORT int zip_extract(const char *zipname, const char *dir,
|
||||
int (*on_extract_entry)(const char *filename,
|
||||
void *arg),
|
||||
void *arg);
|
||||
/** @} */
|
||||
#ifdef __cplusplus
|
||||
}
|
||||
#endif
|
||||
|
||||
#endif
|
186
util.cpp
Normal file
186
util.cpp
Normal file
@ -0,0 +1,186 @@
|
||||
#include "util.h"
|
||||
|
||||
#include <stdarg.h>
|
||||
#include <fstream>
|
||||
#include <thread>
|
||||
#include <unordered_set>
|
||||
#include <vector>
|
||||
|
||||
#if defined(__APPLE__) && defined(__MACH__)
|
||||
#include <sys/sysctl.h>
|
||||
#include <sys/types.h>
|
||||
#endif
|
||||
|
||||
#if !defined(_WIN32)
|
||||
#include <sys/ioctl.h>
|
||||
#include <unistd.h>
|
||||
#endif
|
||||
|
||||
// Returns true when `str` ends with `ending` (an empty `ending` always matches).
bool ends_with(const std::string& str, const std::string& ending) {
    const size_t n = str.length();
    const size_t m = ending.length();
    return n >= m && str.compare(n - m, m, ending) == 0;
}
|
||||
|
||||
// Returns true when `str` begins with `start` (an empty `start` always matches).
// Uses a bounded prefix compare instead of find(), which would scan the whole
// string on a mismatch (O(n*m) worst case for long inputs).
bool starts_with(const std::string& str, const std::string& start) {
    return str.compare(0, start.length(), start) == 0;
}
|
||||
|
||||
// Replaces every occurrence of `target` in `str` with `replacement`, in place.
void replace_all_chars(std::string& str, char target, char replacement) {
    for (char& c : str) {
        if (c == target) {
            c = replacement;
        }
    }
}
|
||||
|
||||
// printf-style formatting into a std::string.
// Two-pass vsnprintf: first call measures the required size, second call
// writes into a correctly sized buffer.
std::string format(const char* fmt, ...) {
    va_list ap;
    va_start(ap, fmt);
    va_list ap2;
    va_copy(ap2, ap);
    const int size = vsnprintf(NULL, 0, fmt, ap);
    va_end(ap);
    if (size < 0) {
        // Encoding/format error: the original code would have constructed a
        // string with a negative (wrapped-around) length here.
        va_end(ap2);
        return std::string();
    }
    std::vector<char> buf(size + 1);
    vsnprintf(buf.data(), buf.size(), fmt, ap2);
    va_end(ap2);
    return std::string(buf.data(), size);
}
|
||||
|
||||
#ifdef _WIN32 // code for windows
|
||||
#include <windows.h>
|
||||
|
||||
// True when `filename` names an existing regular file (directories excluded).
bool file_exists(const std::string& filename) {
    DWORD attributes = GetFileAttributesA(filename.c_str());
    // INVALID_FILE_ATTRIBUTES means the path could not be queried at all
    // (missing file, access denied, bad path, ...).
    return (attributes != INVALID_FILE_ATTRIBUTES && !(attributes & FILE_ATTRIBUTE_DIRECTORY));
}
|
||||
|
||||
// True when `path` names an existing directory.
bool is_directory(const std::string& path) {
    DWORD attributes = GetFileAttributesA(path.c_str());
    // Must both be queryable and carry the directory attribute bit.
    return (attributes != INVALID_FILE_ATTRIBUTES && (attributes & FILE_ATTRIBUTE_DIRECTORY));
}
|
||||
|
||||
#else // Unix
|
||||
#include <dirent.h>
|
||||
#include <sys/stat.h>
|
||||
|
||||
// True when `filename` names an existing regular file (directories excluded).
bool file_exists(const std::string& filename) {
    struct stat info;
    if (stat(filename.c_str(), &info) != 0) {
        return false;  // path does not exist or cannot be queried
    }
    return S_ISREG(info.st_mode);
}
|
||||
|
||||
// True when `path` names an existing directory.
bool is_directory(const std::string& path) {
    struct stat info;
    if (stat(path.c_str(), &info) != 0) {
        return false;  // path does not exist or cannot be queried
    }
    return S_ISDIR(info.st_mode);
}
|
||||
|
||||
#endif
|
||||
|
||||
// get_num_physical_cores is copy from
|
||||
// https://github.com/ggerganov/llama.cpp/blob/master/examples/common.cpp
|
||||
// LICENSE: https://github.com/ggerganov/llama.cpp/blob/master/LICENSE
|
||||
// Best-effort count of physical CPU cores.
// get_num_physical_cores is copy from
// https://github.com/ggerganov/llama.cpp/blob/master/examples/common.cpp
// LICENSE: https://github.com/ggerganov/llama.cpp/blob/master/LICENSE
int32_t get_num_physical_cores() {
#ifdef __linux__
    // Each distinct thread_siblings mask in sysfs corresponds to one physical
    // core, so the number of unique masks is the number of cores.
    std::unordered_set<std::string> core_masks;
    for (uint32_t cpu_index = 0; cpu_index < UINT32_MAX; ++cpu_index) {
        std::ifstream siblings_file("/sys/devices/system/cpu" + std::to_string(cpu_index) + "/topology/thread_siblings");
        if (!siblings_file.is_open()) {
            break;  // no more cpus
        }
        std::string mask;
        if (std::getline(siblings_file, mask)) {
            core_masks.insert(mask);
        }
    }
    if (!core_masks.empty()) {
        return static_cast<int32_t>(core_masks.size());
    }
#elif defined(__APPLE__) && defined(__MACH__)
    // Prefer the performance-core count (Apple Silicon), then the generic one.
    int32_t physical_cores;
    size_t len = sizeof(physical_cores);
    if (sysctlbyname("hw.perflevel0.physicalcpu", &physical_cores, &len, NULL, 0) == 0) {
        return physical_cores;
    }
    if (sysctlbyname("hw.physicalcpu", &physical_cores, &len, NULL, 0) == 0) {
        return physical_cores;
    }
#elif defined(_WIN32)
    // TODO: Implement
#endif
    // Fallback heuristic: assume SMT above 4 logical threads, halve the count.
    unsigned int logical = std::thread::hardware_concurrency();
    return logical > 0 ? (logical <= 4 ? logical : logical / 2) : 4;
}
|
||||
|
||||
// Returns the final path component: everything after the last path separator.
// Searches '/' and '\\' together so mixed-separator paths behave correctly —
// the previous two-step lookup returned "b\\c" for "a/b\\c" instead of "c".
std::string basename(const std::string& path) {
    size_t pos = path.find_last_of("/\\");
    if (pos != std::string::npos) {
        return path.substr(pos + 1);
    }
    return path;  // no separator: the whole path is the basename
}
|
||||
|
||||
// Joins two path components; inserts '/' unless `p1` already ends with a
// separator. An empty component yields the other component unchanged.
std::string path_join(const std::string& p1, const std::string& p2) {
    if (p1.empty()) {
        return p2;
    }
    if (p2.empty()) {
        return p1;
    }
    const char tail = p1[p1.length() - 1];
    return (tail == '/' || tail == '\\') ? p1 + p2 : p1 + "/" + p2;
}
|
||||
|
||||
// Current global log threshold; log_printf drops messages below this level.
static SDLogLevel log_level = SDLogLevel::INFO;

// Sets the global minimum severity for subsequent log_printf calls.
void set_sd_log_level(SDLogLevel level) {
    log_level = level;
}
|
||||
|
||||
// Core logging routine behind the LOG_* macros.
// Prints "[LEVEL] file:line - message\n" and flushes; messages below the
// global threshold are dropped. Only ERROR goes to stderr, all other levels
// (including WARN, matching the original behavior) go to stdout.
// Consolidates the four previously duplicated per-level branches into one
// tag/stream selection followed by a single formatting path.
void log_printf(SDLogLevel level, const char* file, int line, const char* format, ...) {
    if (level < log_level) {
        return;
    }

    const char* tag = "ERROR";
    FILE* stream    = stderr;
    if (level == SDLogLevel::DEBUG) {
        tag    = "DEBUG";
        stream = stdout;
    } else if (level == SDLogLevel::INFO) {
        tag    = "INFO";
        stream = stdout;
    } else if (level == SDLogLevel::WARN) {
        tag    = "WARN";
        stream = stdout;
    }

    va_list args;
    va_start(args, format);
    fprintf(stream, "[%s] %s:%-4d - ", tag, basename(file).c_str(), line);
    vfprintf(stream, format, args);
    fprintf(stream, "\n");
    fflush(stream);
    va_end(args);
}
|
37
util.h
Normal file
37
util.h
Normal file
@ -0,0 +1,37 @@
|
||||
#ifndef __UTIL_H__
#define __UTIL_H__

#include <cstdint>  // added: the header uses int32_t but was not self-contained
#include <string>

// Returns true if `str` ends with `ending` (also true for an empty `ending`).
bool ends_with(const std::string& str, const std::string& ending);
// Returns true if `str` begins with `start` (also true for an empty `start`).
bool starts_with(const std::string& str, const std::string& start);

// printf-style formatting into a std::string.
std::string format(const char* fmt, ...);

// Replaces every occurrence of `target` in `str` with `replacement`, in place.
void replace_all_chars(std::string& str, char target, char replacement);

// True if `filename` names an existing regular file (not a directory).
bool file_exists(const std::string& filename);
// True if `path` names an existing directory.
bool is_directory(const std::string& path);

// Returns the final path component (text after the last path separator).
std::string basename(const std::string& path);

// Joins two path components, inserting '/' unless p1 ends with a separator.
std::string path_join(const std::string& p1, const std::string& p2);

// Best-effort count of physical CPU cores (heuristic fallback otherwise).
int32_t get_num_physical_cores();

// Log severity levels, in increasing order of importance.
// NOTE(review): plain enum exposes DEBUG/INFO/WARN/ERROR unscoped; on Windows
// ERROR can collide with a <windows.h> macro — confirm before including both.
enum SDLogLevel {
    DEBUG,
    INFO,
    WARN,
    ERROR
};

// Sets the global minimum severity; messages below it are discarded.
void set_sd_log_level(SDLogLevel level);

// Core logging routine; prefer the LOG_* macros, which fill in file/line.
void log_printf(SDLogLevel level, const char* file, int line, const char* format, ...);

#define LOG_DEBUG(format, ...) log_printf(SDLogLevel::DEBUG, __FILE__, __LINE__, format, ##__VA_ARGS__)
#define LOG_INFO(format, ...) log_printf(SDLogLevel::INFO, __FILE__, __LINE__, format, ##__VA_ARGS__)
#define LOG_WARN(format, ...) log_printf(SDLogLevel::WARN, __FILE__, __LINE__, format, ##__VA_ARGS__)
#define LOG_ERROR(format, ...) log_printf(SDLogLevel::ERROR, __FILE__, __LINE__, format, ##__VA_ARGS__)
#endif  // __UTIL_H__
|
Loading…
Reference in New Issue
Block a user