feat: ggml-alloc integration and gpu acceleration (#75)

* set ggml url to FSSRepo/ggml

* ggml-alloc integration

* offload all functions to gpu

* gguf format + native converter

* merge custom vae to a model

* full offload to gpu

* improve pretty progress

---------

Co-authored-by: leejet <leejet714@gmail.com>
Steward Garcia 2023-11-26 06:02:36 -05:00 committed by GitHub
parent c874063408
commit 8124588cf1
29 changed files with 120774 additions and 2754 deletions

8
.gitignore vendored

@@ -1,6 +1,12 @@
build*/
test/
.vscode/
.cache/
*.swp
.vscode/
*.bat
*.bin
*.exe
*.gguf
output.png
models/*

4
.gitmodules vendored

@@ -1,3 +1,3 @@
[submodule "ggml"]
path = ggml
url = https://github.com/leejet/ggml.git
path = ggml
url = https://github.com/FSSRepo/ggml.git

CMakeLists.txt

@@ -24,10 +24,24 @@ endif()
# general
#option(SD_BUILD_TESTS "sd: build tests" ${SD_STANDALONE})
option(SD_BUILD_EXAMPLES "sd: build examples" ${SD_STANDALONE})
option(SD_CUBLAS "sd: cuda backend" OFF)
option(SD_FLASH_ATTN "sd: use flash attention for x4 less memory usage" OFF)
option(BUILD_SHARED_LIBS "sd: build shared libs" OFF)
#option(SD_BUILD_SERVER "sd: build server example" ON)
if(SD_CUBLAS)
message("Use CUBLAS as backend stable-diffusion")
set(GGML_CUBLAS ON)
add_definitions(-DSD_USE_CUBLAS)
endif()
if(SD_FLASH_ATTN)
message("Use Flash Attention for memory optimization")
add_definitions(-DSD_USE_FLASH_ATTENTION)
endif()
set(CMAKE_POLICY_DEFAULT_CMP0077 NEW)
# deps
add_subdirectory(ggml)
@@ -38,6 +52,7 @@ target_link_libraries(${SD_LIB} PUBLIC ggml)
target_include_directories(${SD_LIB} PUBLIC .)
target_compile_features(${SD_LIB} PUBLIC cxx_std_11)
add_subdirectory(common)
if (SD_BUILD_EXAMPLES)
add_subdirectory(examples)

README.md

@@ -9,17 +9,20 @@ Inference of [Stable Diffusion](https://github.com/CompVis/stable-diffusion) in
## Features
- Plain C/C++ implementation based on [ggml](https://github.com/ggerganov/ggml), working in the same way as [llama.cpp](https://github.com/ggerganov/llama.cpp)
- Super lightweight and without external dependencies.
- 16-bit, 32-bit float support
- 4-bit, 5-bit and 8-bit integer quantization support
- Accelerated memory-efficient CPU inference
- Only requires ~2.3GB when using txt2img with fp16 precision to generate a 512x512 image
- Only requires ~2.3GB when using txt2img with fp16 precision to generate a 512x512 image; enabling Flash Attention drops this to ~1.8GB.
- AVX, AVX2 and AVX512 support for x86 architectures
- SD1.x and SD2.x support
- Full CUDA backend for GPU acceleration; for now it only covers float16 and float32 models. There are some issues with quantized models and CUDA; they will be fixed in the future.
- Flash Attention for memory usage optimization (CPU only for now).
- Original `txt2img` and `img2img` mode
- Negative prompt
- [stable-diffusion-webui](https://github.com/AUTOMATIC1111/stable-diffusion-webui) style tokenizer (not all the features, only token weighting for now)
- LoRA support, same as [stable-diffusion-webui](https://github.com/AUTOMATIC1111/stable-diffusion-webui/wiki/Features#lora)
- Latent Consistency Models support(LCM/LCM-LoRA)
- Latent Consistency Models support (LCM/LCM-LoRA)
- Sampling method
- `Euler A`
- `Euler`
@@ -40,10 +43,11 @@ Inference of [Stable Diffusion](https://github.com/CompVis/stable-diffusion) in
### TODO
- [ ] More sampling methods
- [ ] GPU support
- [ ] Make inference faster
- The current implementation of ggml_conv_2d is slow and has high memory usage
- [ ] Continuing to reduce memory usage (quantizing the weights of ggml_conv_2d)
- [ ] Implement BPE Tokenizer
- [ ] Add [TAESD](https://github.com/madebyollin/taesd) for faster VAE decoding
- [ ] k-quants support
## Usage
@@ -77,24 +81,20 @@ git submodule update
# curl -L -O https://huggingface.co/stabilityai/stable-diffusion-2-1/blob/main/v2-1_768-nonema-pruned.safetensors
```
- convert weights to ggml model format
- convert weights to gguf model format
```shell
cd models
pip install -r requirements.txt
# (optional) python convert_diffusers_to_original_stable_diffusion.py --model_path [path to diffusers weights] --checkpoint_path [path to weights]
python convert.py [path to weights] --out_type [output precision]
# For example, python convert.py sd-v1-4.ckpt --out_type f16
./bin/convert sd-v1-4.ckpt -t f16
```
### Quantization
You can specify the output model format using the --out_type parameter
You can specify the output model format using the `--type` or `-t` parameter
- `f16` for 16-bit floating-point
- `f32` for 32-bit floating-point
- `q8_0` for 8-bit integer quantization
- `q5_0` or `q5_1` for 5-bit integer quantization
- `q8_0` for 8-bit integer quantization
- `q5_0` or `q5_1` for 5-bit integer quantization
- `q4_0` or `q4_1` for 4-bit integer quantization
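For example, a minimal invocation (assuming the same checkpoint as above) that produces an 8-bit quantized model:

```shell
./bin/convert sd-v1-4.ckpt -t q8_0
```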
### Build
@@ -115,6 +115,24 @@ cmake .. -DGGML_OPENBLAS=ON
cmake --build . --config Release
```
##### Using CUBLAS
This provides BLAS acceleration using the CUDA cores of your Nvidia GPU. Make sure to have the CUDA toolkit installed. You can download it from your Linux distro's package manager (e.g. `apt install nvidia-cuda-toolkit`) or from here: [CUDA Toolkit](https://developer.nvidia.com/cuda-downloads). At least 4 GB of VRAM is recommended.
```
cmake .. -DSD_CUBLAS=ON
cmake --build . --config Release
```
### Using Flash Attention
Enabling Flash Attention reduces memory usage by at least 400 MB. At the moment, it is not supported when CUBLAS is enabled because the kernel implementation is missing.
```
cmake .. -DSD_FLASH_ATTN=ON
cmake --build . --config Release
```
### Run
```
@@ -141,6 +159,7 @@ arguments:
--steps STEPS number of sample steps (default: 20)
--rng {std_default, cuda} RNG (default: cuda)
-s SEED, --seed SEED RNG seed (default: 42, use random seed for < 0)
-b, --batch-count COUNT number of images to generate (default: 1)
--schedule {discrete, karras} Denoiser sigma schedule (default: discrete)
-v, --verbose print extra info
```
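For example, a sketch of the new `-b`/`--batch-count` option (model path taken from the txt2img example below); with the default `output.png`, the extra images are written as `output_2.png`, `output_3.png`, and so on:

```
./bin/sd -m ../sd-v1-4-f16.gguf -p "a lovely cat" -b 4
```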
@@ -148,7 +167,7 @@ arguments:
#### txt2img example
```
./bin/sd -m ../models/sd-v1-4-ggml-model-f16.bin -p "a lovely cat"
./bin/sd -m ../sd-v1-4-f16.gguf -p "a lovely cat"
```
Using formats of different precisions will yield results of varying quality.
@@ -163,7 +182,7 @@ Using formats of different precisions will yield results of varying quality.
```
./bin/sd --mode img2img -m ../models/sd-v1-4-ggml-model-f16.bin -p "cat with blue eyes" -i ./output.png -o ./img2img_output.png --strength 0.4
./bin/sd --mode img2img -m ../models/sd-v1-4-f16.gguf -p "cat with blue eyes" -i ./output.png -o ./img2img_output.png --strength 0.4
```
<p align="center">
@@ -172,12 +191,11 @@ Using formats of different precisions will yield results of varying quality.
#### with LoRA
- convert lora weights to ggml model format
- convert lora weights to gguf model format
```shell
cd models
python convert.py [path to weights] --lora
# For example, python convert.py marblesh.safetensors
bin/convert [lora path] -t f16
# For example, bin/convert marblesh.safetensors -t f16
```
- You can specify the directory where the lora weights are stored via `--lora-model-dir`. If not specified, the default is the current working directory.
@@ -187,10 +205,10 @@ Using formats of different precisions will yield results of varying quality.
Here's a simple example:
```
./bin/sd -m ../models/v1-5-pruned-emaonly-ggml-model-f16.bin -p "a lovely cat<lora:marblesh:1>" --lora-model-dir ../models
./bin/sd -m ../models/v1-5-pruned-emaonly-f16.gguf -p "a lovely cat<lora:marblesh:1>" --lora-model-dir ../models
```
`../models/marblesh-ggml-lora.bin` will be applied to the model
`../models/marblesh.gguf` will be applied to the model
#### LCM/LCM-LoRA
@@ -201,7 +219,7 @@ Here's a simple example:
Here's a simple example:
```
./bin/sd -m ../models/v1-5-pruned-emaonly-ggml-model-f16.bin -p "a lovely cat<lora:lcm-lora-sdv1-5:1>" --steps 4 --lora-model-dir ../models -v --cfg-scale 1
./bin/sd -m ../models/v1-5-pruned-emaonly-f16.gguf -p "a lovely cat<lora:lcm-lora-sdv1-5:1>" --steps 4 --lora-model-dir ../models -v --cfg-scale 1
```
| without LCM-LoRA (--cfg-scale 7) | with LCM-LoRA (--cfg-scale 1) |
@@ -222,7 +240,7 @@ docker build -t sd .
```shell
docker run -v /path/to/models:/models -v /path/to/output/:/output sd [args...]
# For example
# docker run -v ./models:/models -v ./build:/output sd -m /models/sd-v1-4-ggml-model-f16.bin -p "a lovely cat" -v -o /output/output.png
# docker run -v ./models:/models -v ./build:/output sd -m /models/sd-v1-4-f16.gguf -p "a lovely cat" -v -o /output/output.png
```
## Memory/Disk Requirements
@@ -230,7 +248,8 @@ docker run -v /path/to/models:/models -v /path/to/output/:/output sd [args...]
| precision | f32 | f16 |q8_0 |q5_0 |q5_1 |q4_0 |q4_1 |
| ---- | ---- |---- |---- |---- |---- |---- |---- |
| **Disk** | 2.7G | 2.0G | 1.7G | 1.6G | 1.6G | 1.5G | 1.5G |
| **Memory**(txt2img - 512 x 512) | ~2.8G | ~2.3G | ~2.1G | ~2.0G | ~2.0G | ~2.0G | ~2.0G |
| **Memory** (txt2img - 512 x 512) | ~2.8G | ~2.3G | ~2.1G | ~2.0G | ~2.0G | ~2.0G | ~2.0G |
| **Memory** (txt2img - 512 x 512) *with Flash Attention* | ~2.4G | ~1.9G | ~1.6G | ~1.5G | ~1.5G | ~1.5G | ~1.5G |
## Contributors

15
common/CMakeLists.txt Normal file

@@ -0,0 +1,15 @@
set(TARGET common)
# json.hpp library from: https://github.com/nlohmann/json
add_library(${TARGET} OBJECT common.cpp common.h stb_image.h stb_image_write.h json.hpp)
target_include_directories(${TARGET} PUBLIC .)
target_link_libraries(${TARGET} PRIVATE stable-diffusion ${CMAKE_THREAD_LIBS_INIT})
target_compile_features(${TARGET} PUBLIC cxx_std_11)
# ZIP Library from: https://github.com/kuba--/zip
set(Z_TARGET zip)
add_library(${Z_TARGET} OBJECT zip.c zip.h miniz.h)
target_include_directories(${Z_TARGET} PUBLIC .)

common/common.cpp

@@ -1,20 +1,11 @@
#include <stdio.h>
#include <ctime>
#include "common.h"
#include <cstring>
#include <fstream>
#include <iostream>
#include <random>
#include <string>
#include <thread>
#include <unordered_set>
#include "stable-diffusion.h"
#define STB_IMAGE_IMPLEMENTATION
#include "stb_image.h"
#define STB_IMAGE_WRITE_IMPLEMENTATION
#define STB_IMAGE_WRITE_STATIC
#include "stb_image_write.h"
#include <vector>
#if defined(__APPLE__) && defined(__MACH__)
#include <sys/sysctl.h>
@@ -26,9 +17,6 @@
#include <unistd.h>
#endif
#define TXT2IMG "txt2img"
#define IMG2IMG "img2img"
// get_num_physical_cores is copy from
// https://github.com/ggerganov/llama.cpp/blob/master/examples/common.cpp
// LICENSE: https://github.com/ggerganov/llama.cpp/blob/master/LICENSE
@@ -72,7 +60,7 @@ const char* rng_type_to_str[] = {
"cuda",
};
// Names of the sampler method, same order as enum SampleMethod in stable-diffusion.h
// Names of the sampler methods, same order as enum sample_method in stable-diffusion.h
const char* sample_method_str[] = {
"euler_a",
"euler",
@@ -84,53 +72,36 @@ const char* sample_method_str[] = {
"lcm",
};
// Names of the sigma schedule overrides, same order as Schedule in stable-diffusion.h
// Names of the sigma schedule overrides, same order as sample_schedule in stable-diffusion.h
const char* schedule_str[] = {
"default",
"discrete",
"karras"};
struct Option {
int n_threads = -1;
std::string mode = TXT2IMG;
std::string model_path;
std::string lora_model_dir;
std::string output_path = "output.png";
std::string init_img;
std::string prompt;
std::string negative_prompt;
float cfg_scale = 7.0f;
int w = 512;
int h = 512;
SampleMethod sample_method = EULER_A;
Schedule schedule = DEFAULT;
int sample_steps = 20;
float strength = 0.75f;
RNGType rng_type = CUDA_RNG;
int64_t seed = 42;
bool verbose = false;
const char* modes_str[] = {
"txt2img",
"img2img"};
void print() {
printf("Option: \n");
printf(" n_threads: %d\n", n_threads);
printf(" mode: %s\n", mode.c_str());
printf(" model_path: %s\n", model_path.c_str());
printf(" lora_model_dir: %s\n", lora_model_dir.c_str());
printf(" output_path: %s\n", output_path.c_str());
printf(" init_img: %s\n", init_img.c_str());
printf(" prompt: %s\n", prompt.c_str());
printf(" negative_prompt: %s\n", negative_prompt.c_str());
printf(" cfg_scale: %.2f\n", cfg_scale);
printf(" width: %d\n", w);
printf(" height: %d\n", h);
printf(" sample_method: %s\n", sample_method_str[sample_method]);
printf(" schedule: %s\n", schedule_str[schedule]);
printf(" sample_steps: %d\n", sample_steps);
printf(" strength: %.2f\n", strength);
printf(" rng: %s\n", rng_type_to_str[rng_type]);
printf(" seed: %ld\n", seed);
}
};
void print_params(SDParams params) {
printf("Option: \n");
printf(" n_threads: %d\n", params.n_threads);
printf(" mode: %s\n", modes_str[params.mode]);
printf(" model_path: %s\n", params.model_path.c_str());
printf(" output_path: %s\n", params.output_path.c_str());
printf(" init_img: %s\n", params.input_path.c_str());
printf(" prompt: %s\n", params.prompt.c_str());
printf(" negative_prompt: %s\n", params.negative_prompt.c_str());
printf(" cfg_scale: %.2f\n", params.cfg_scale);
printf(" width: %d\n", params.width);
printf(" height: %d\n", params.height);
printf(" sample_method: %s\n", sample_method_str[params.sample_method]);
printf(" schedule: %s\n", schedule_str[params.schedule]);
printf(" sample_steps: %d\n", params.sample_steps);
printf(" strength: %.2f\n", params.strength);
printf(" rng: %s\n", rng_type_to_str[params.rng_type]);
printf(" seed: %ld\n", params.seed);
printf(" batch_count: %d\n", params.batch_count);
}
void print_usage(int argc, const char* argv[]) {
printf("usage: %s [arguments]\n", argv[0]);
@@ -143,7 +114,7 @@ void print_usage(int argc, const char* argv[]) {
printf(" -m, --model [MODEL] path to model\n");
printf(" --lora-model-dir [DIR] lora model directory\n");
printf(" -i, --init-img [IMAGE] path to the input image, required by img2img\n");
printf(" -o, --output OUTPUT path to write result image to (default: .\\output.png)\n");
printf(" -o, --output OUTPUT path to write result image to (default: ./output.png)\n");
printf(" -p, --prompt [PROMPT] the prompt to render\n");
printf(" -n, --negative-prompt PROMPT the negative prompt (default: \"\")\n");
printf(" --cfg-scale SCALE unconditional guidance scale: (default: 7.0)\n");
@@ -156,95 +127,113 @@ void print_usage(int argc, const char* argv[]) {
printf(" --steps STEPS number of sample steps (default: 20)\n");
printf(" --rng {std_default, cuda} RNG (default: cuda)\n");
printf(" -s SEED, --seed SEED RNG seed (default: 42, use random seed for < 0)\n");
printf(" -b, --batch-count COUNT number of images to generate.\n");
printf(" --schedule {discrete, karras} Denoiser sigma schedule (default: discrete)\n");
printf(" -v, --verbose print extra info\n");
}
void parse_args(int argc, const char* argv[], Option* opt) {
void parse_args(int argc, const char** argv, SDParams& params) {
bool invalid_arg = false;
std::string arg;
for (int i = 1; i < argc; i++) {
std::string arg = argv[i];
arg = argv[i];
if (arg == "-t" || arg == "--threads") {
if (++i >= argc) {
invalid_arg = true;
break;
}
opt->n_threads = std::stoi(argv[i]);
params.n_threads = std::stoi(argv[i]);
} else if (arg == "-M" || arg == "--mode") {
if (++i >= argc) {
invalid_arg = true;
break;
}
opt->mode = argv[i];
const char* mode_selected = argv[i];
int mode_found = -1;
for (int d = 0; d < MODE_COUNT; d++) {
if (!strcmp(mode_selected, modes_str[d])) {
mode_found = d;
}
}
if (mode_found == -1) {
fprintf(stderr, "error: invalid mode %s, must be one of [txt2img, img2img]\n",
mode_selected);
exit(1);
}
params.mode = (sd_mode)mode_found;
} else if (arg == "-m" || arg == "--model") {
if (++i >= argc) {
invalid_arg = true;
break;
}
opt->model_path = argv[i];
params.model_path = argv[i];
} else if (arg == "--lora-model-dir") {
if (++i >= argc) {
invalid_arg = true;
break;
}
opt->lora_model_dir = argv[i];
params.lora_model_dir = argv[i];
} else if (arg == "-i" || arg == "--init-img") {
if (++i >= argc) {
invalid_arg = true;
break;
}
opt->init_img = argv[i];
params.input_path = argv[i];
} else if (arg == "-o" || arg == "--output") {
if (++i >= argc) {
invalid_arg = true;
break;
}
opt->output_path = argv[i];
params.output_path = argv[i];
} else if (arg == "-p" || arg == "--prompt") {
if (++i >= argc) {
invalid_arg = true;
break;
}
opt->prompt = argv[i];
params.prompt = argv[i];
} else if (arg == "-n" || arg == "--negative-prompt") {
if (++i >= argc) {
invalid_arg = true;
break;
}
opt->negative_prompt = argv[i];
params.negative_prompt = argv[i];
} else if (arg == "--cfg-scale") {
if (++i >= argc) {
invalid_arg = true;
break;
}
opt->cfg_scale = std::stof(argv[i]);
params.cfg_scale = std::stof(argv[i]);
} else if (arg == "--strength") {
if (++i >= argc) {
invalid_arg = true;
break;
}
opt->strength = std::stof(argv[i]);
params.strength = std::stof(argv[i]);
} else if (arg == "-H" || arg == "--height") {
if (++i >= argc) {
invalid_arg = true;
break;
}
opt->h = std::stoi(argv[i]);
params.height = std::stoi(argv[i]);
} else if (arg == "-W" || arg == "--width") {
if (++i >= argc) {
invalid_arg = true;
break;
}
opt->w = std::stoi(argv[i]);
params.width = std::stoi(argv[i]);
} else if (arg == "--steps") {
if (++i >= argc) {
invalid_arg = true;
break;
}
opt->sample_steps = std::stoi(argv[i]);
params.sample_steps = std::stoi(argv[i]);
} else if (arg == "-b" || arg == "--batch-count") {
if (++i >= argc) {
invalid_arg = true;
break;
}
params.batch_count = std::stoi(argv[i]);
} else if (arg == "--rng") {
if (++i >= argc) {
invalid_arg = true;
@@ -252,9 +241,9 @@ void parse_args(int argc, const char* argv[], Option* opt) {
}
std::string rng_type_str = argv[i];
if (rng_type_str == "std_default") {
opt->rng_type = STD_DEFAULT_RNG;
params.rng_type = STD_DEFAULT_RNG;
} else if (rng_type_str == "cuda") {
opt->rng_type = CUDA_RNG;
params.rng_type = CUDA_RNG;
} else {
invalid_arg = true;
break;
@@ -275,13 +264,13 @@ void parse_args(int argc, const char* argv[], Option* opt) {
invalid_arg = true;
break;
}
opt->schedule = (Schedule)schedule_found;
params.schedule = (Schedule)schedule_found;
} else if (arg == "-s" || arg == "--seed") {
if (++i >= argc) {
invalid_arg = true;
break;
}
opt->seed = std::stoll(argv[i]);
params.seed = std::stoll(argv[i]);
} else if (arg == "--sampling-method") {
if (++i >= argc) {
invalid_arg = true;
@@ -298,81 +287,74 @@ void parse_args(int argc, const char* argv[], Option* opt) {
invalid_arg = true;
break;
}
opt->sample_method = (SampleMethod)sample_method_found;
params.sample_method = (SampleMethod)sample_method_found;
} else if (arg == "-h" || arg == "--help") {
print_usage(argc, argv);
exit(0);
} else if (arg == "-v" || arg == "--verbose") {
opt->verbose = true;
params.verbose = true;
} else {
fprintf(stderr, "error: unknown argument: %s\n", arg.c_str());
print_usage(argc, argv);
exit(1);
}
if (invalid_arg) {
fprintf(stderr, "error: invalid parameter for argument: %s\n", arg.c_str());
print_usage(argc, argv);
exit(1);
}
}
if (opt->n_threads <= 0) {
opt->n_threads = get_num_physical_cores();
}
if (opt->mode != TXT2IMG && opt->mode != IMG2IMG) {
fprintf(stderr, "error: invalid mode %s, must be one of ['%s', '%s']\n",
opt->mode.c_str(), TXT2IMG, IMG2IMG);
if (invalid_arg) {
fprintf(stderr, "error: invalid parameter for argument: %s\n", arg.c_str());
print_usage(argc, argv);
exit(1);
}
if (params.n_threads <= 0) {
params.n_threads = get_num_physical_cores();
}
if (opt->prompt.length() == 0) {
if (params.prompt.length() == 0) {
fprintf(stderr, "error: the following arguments are required: prompt\n");
print_usage(argc, argv);
exit(1);
}
if (opt->model_path.length() == 0) {
if (params.model_path.length() == 0) {
fprintf(stderr, "error: the following arguments are required: model_path\n");
print_usage(argc, argv);
exit(1);
}
if (opt->mode == IMG2IMG && opt->init_img.length() == 0) {
if (params.mode == IMG2IMG && params.input_path.length() == 0) {
fprintf(stderr, "error: when using the img2img mode, the following arguments are required: init-img\n");
print_usage(argc, argv);
exit(1);
}
if (opt->output_path.length() == 0) {
if (params.output_path.length() == 0) {
fprintf(stderr, "error: the following arguments are required: output_path\n");
print_usage(argc, argv);
exit(1);
}
if (opt->w <= 0 || opt->w % 64 != 0) {
if (params.width <= 0 || params.width % 64 != 0) {
fprintf(stderr, "error: the width must be a multiple of 64\n");
exit(1);
}
if (opt->h <= 0 || opt->h % 64 != 0) {
if (params.height <= 0 || params.height % 64 != 0) {
fprintf(stderr, "error: the height must be a multiple of 64\n");
exit(1);
}
if (opt->sample_steps <= 0) {
if (params.sample_steps <= 0) {
fprintf(stderr, "error: the sample_steps must be greater than 0\n");
exit(1);
}
if (opt->strength < 0.f || opt->strength > 1.f) {
if (params.strength < 0.f || params.strength > 1.f) {
fprintf(stderr, "error: can only work with strength in [0.0, 1.0]\n");
exit(1);
}
if (opt->seed < 0) {
if (params.seed < 0) {
srand((int)time(NULL));
opt->seed = rand();
params.seed = rand();
}
}
@@ -388,98 +370,22 @@ std::string basename(const std::string& path) {
return path;
}
int main(int argc, const char* argv[]) {
Option opt;
parse_args(argc, argv, &opt);
if (opt.verbose) {
opt.print();
printf("%s", sd_get_system_info().c_str());
set_sd_log_level(SDLogLevel::DEBUG);
std::string get_image_params(SDParams params, int seed) {
std::string parameter_string = params.prompt + "\n";
if (params.negative_prompt.size() != 0) {
parameter_string += "Negative prompt: " + params.negative_prompt + "\n";
}
bool vae_decode_only = true;
std::vector<uint8_t> init_img;
if (opt.mode == IMG2IMG) {
vae_decode_only = false;
int c = 0;
unsigned char* img_data = stbi_load(opt.init_img.c_str(), &opt.w, &opt.h, &c, 3);
if (img_data == NULL) {
fprintf(stderr, "load image from '%s' failed\n", opt.init_img.c_str());
return 1;
}
if (c != 3) {
fprintf(stderr, "input image must be a 3 channels RGB image, but got %d channels\n", c);
free(img_data);
return 1;
}
if (opt.w <= 0 || opt.w % 64 != 0) {
fprintf(stderr, "error: the width of image must be a multiple of 64\n");
free(img_data);
return 1;
}
if (opt.h <= 0 || opt.h % 64 != 0) {
fprintf(stderr, "error: the height of image must be a multiple of 64\n");
free(img_data);
return 1;
}
init_img.assign(img_data, img_data + (opt.w * opt.h * c));
}
StableDiffusion sd(opt.n_threads, vae_decode_only, true, opt.lora_model_dir, opt.rng_type);
if (!sd.load_from_file(opt.model_path, opt.schedule)) {
return 1;
}
std::vector<uint8_t> img;
if (opt.mode == TXT2IMG) {
img = sd.txt2img(opt.prompt,
opt.negative_prompt,
opt.cfg_scale,
opt.w,
opt.h,
opt.sample_method,
opt.sample_steps,
opt.seed);
} else {
img = sd.img2img(init_img,
opt.prompt,
opt.negative_prompt,
opt.cfg_scale,
opt.w,
opt.h,
opt.sample_method,
opt.sample_steps,
opt.strength,
opt.seed);
}
if (img.size() == 0) {
fprintf(stderr, "generate failed\n");
return 1;
}
std::string parameter_string = opt.prompt + "\n";
if (opt.negative_prompt.size() != 0) {
parameter_string += "Negative prompt: " + opt.negative_prompt + "\n";
}
parameter_string += "Steps: " + std::to_string(opt.sample_steps) + ", ";
parameter_string += "CFG scale: " + std::to_string(opt.cfg_scale) + ", ";
parameter_string += "Seed: " + std::to_string(opt.seed) + ", ";
parameter_string += "Size: " + std::to_string(opt.w) + "x" + std::to_string(opt.h) + ", ";
parameter_string += "Model: " + basename(opt.model_path) + ", ";
parameter_string += "RNG: " + std::string(rng_type_to_str[opt.rng_type]) + ", ";
parameter_string += "Sampler: " + std::string(sample_method_str[opt.sample_method]);
if (opt.schedule == KARRAS) {
parameter_string += "Steps: " + std::to_string(params.sample_steps) + ", ";
parameter_string += "CFG scale: " + std::to_string(params.cfg_scale) + ", ";
parameter_string += "Seed: " + std::to_string(seed) + ", ";
parameter_string += "Size: " + std::to_string(params.width) + "x" + std::to_string(params.height) + ", ";
parameter_string += "Model: " + basename(params.model_path) + ", ";
parameter_string += "RNG: " + std::string(rng_type_to_str[params.rng_type]) + ", ";
parameter_string += "Sampler: " + std::string(sample_method_str[params.sample_method]);
if (params.schedule == KARRAS) {
parameter_string += " karras";
}
parameter_string += ", ";
parameter_string += "Version: stable-diffusion.cpp";
stbi_write_png(opt.output_path.c_str(), opt.w, opt.h, 3, img.data(), 0, parameter_string.c_str());
printf("save result image to '%s'\n", opt.output_path.c_str());
return 0;
}
return parameter_string;
}
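For reference, `get_image_params` builds a one-line parameter record (prefixed by the prompt, plus the negative prompt when set) that is passed to `stbi_write_png`, presumably to be embedded as PNG metadata by the repo's bundled stb_image_write. With mostly default settings it looks roughly like this (illustrative values):

```
a lovely cat
Steps: 20, CFG scale: 7.000000, Seed: 42, Size: 512x512, Model: sd-v1-4-f16.gguf, RNG: cuda, Sampler: euler_a, Version: stable-diffusion.cpp
```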

43
common/common.h Normal file
View File

@@ -0,0 +1,43 @@
#pragma once
#include <string>
#include "stable-diffusion.h"
enum sd_mode {
TXT2IMG,
IMG2IMG,
MODE_COUNT
};
struct SDParams {
int n_threads = -1;
sd_mode mode = TXT2IMG;
std::string model_path;
std::string lora_model_dir;
std::string output_path = "output.png";
std::string input_path;
std::string prompt;
std::string negative_prompt;
float cfg_scale = 7.0f;
int width = 512;
int height = 512;
int batch_count = 1;
SampleMethod sample_method = EULER_A;
Schedule schedule = DEFAULT;
int sample_steps = 20;
float strength = 0.75f;
RNGType rng_type = CUDA_RNG;
int64_t seed = 42;
bool verbose = false;
};
void print_params(SDParams params);
void print_usage(int argc, const char* argv[]);
void parse_args(int argc, const char** argv, SDParams& params);
std::string get_image_params(SDParams params, int seed);

24596
common/json.hpp Normal file

File diff suppressed because it is too large

10130
common/miniz.h Normal file

File diff suppressed because it is too large

1836
common/zip.c Normal file

File diff suppressed because it is too large

509
common/zip.h Normal file

@@ -0,0 +1,509 @@
/*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
* IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR
* OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
* ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
* OTHER DEALINGS IN THE SOFTWARE.
*/
#pragma once
#ifndef ZIP_H
#define ZIP_H
#include <stdint.h>
#include <string.h>
#include <sys/types.h>
#ifndef ZIP_SHARED
#define ZIP_EXPORT
#else
#ifdef _WIN32
#ifdef ZIP_BUILD_SHARED
#define ZIP_EXPORT __declspec(dllexport)
#else
#define ZIP_EXPORT __declspec(dllimport)
#endif
#else
#define ZIP_EXPORT __attribute__((visibility("default")))
#endif
#endif
#ifdef __cplusplus
extern "C" {
#endif
#if !defined(_POSIX_C_SOURCE) && defined(_MSC_VER)
// 64-bit Windows is the only mainstream platform
// where sizeof(long) != sizeof(void*)
#ifdef _WIN64
typedef long long ssize_t; /* byte count or error */
#else
typedef long ssize_t; /* byte count or error */
#endif
#endif
/**
* @mainpage
*
* Documentation for @ref zip.
*/
/**
* @addtogroup zip
* @{
*/
/**
* Default zip compression level.
*/
#define ZIP_DEFAULT_COMPRESSION_LEVEL 6
/**
* Error codes
*/
#define ZIP_ENOINIT -1 // not initialized
#define ZIP_EINVENTNAME -2 // invalid entry name
#define ZIP_ENOENT -3 // entry not found
#define ZIP_EINVMODE -4 // invalid zip mode
#define ZIP_EINVLVL -5 // invalid compression level
#define ZIP_ENOSUP64 -6 // no zip 64 support
#define ZIP_EMEMSET -7 // memset error
#define ZIP_EWRTENT -8 // cannot write data to entry
#define ZIP_ETDEFLINIT -9 // cannot initialize tdefl compressor
#define ZIP_EINVIDX -10 // invalid index
#define ZIP_ENOHDR -11 // header not found
#define ZIP_ETDEFLBUF -12 // cannot flush tdefl buffer
#define ZIP_ECRTHDR -13 // cannot create entry header
#define ZIP_EWRTHDR -14 // cannot write entry header
#define ZIP_EWRTDIR -15 // cannot write to central dir
#define ZIP_EOPNFILE -16 // cannot open file
#define ZIP_EINVENTTYPE -17 // invalid entry type
#define ZIP_EMEMNOALLOC -18 // extracting data using no memory allocation
#define ZIP_ENOFILE -19 // file not found
#define ZIP_ENOPERM -20 // no permission
#define ZIP_EOOMEM -21 // out of memory
#define ZIP_EINVZIPNAME -22 // invalid zip archive name
#define ZIP_EMKDIR -23 // make dir error
#define ZIP_ESYMLINK -24 // symlink error
#define ZIP_ECLSZIP -25 // close archive error
#define ZIP_ECAPSIZE -26 // capacity size too small
#define ZIP_EFSEEK -27 // fseek error
#define ZIP_EFREAD -28 // fread error
#define ZIP_EFWRITE -29 // fwrite error
#define ZIP_ERINIT -30 // cannot initialize reader
#define ZIP_EWINIT -31 // cannot initialize writer
#define ZIP_EWRINIT -32 // cannot initialize writer from reader
/**
* Looks up the error message string corresponding to an error number.
* @param errnum error number
* @return error message string corresponding to errnum or NULL if error is not
* found.
*/
extern ZIP_EXPORT const char *zip_strerror(int errnum);
/**
* @struct zip_t
*
* This data structure is used throughout the library to represent a zip
* archive (forward declaration).
*/
struct zip_t;
/**
* Opens zip archive with compression level using the given mode.
*
* @param zipname zip archive file name.
* @param level compression level (0-9 are the standard zlib-style levels).
* @param mode file access mode.
* - 'r': opens a file for reading/extracting (the file must exist).
* - 'w': creates an empty file for writing.
* - 'a': appends to an existing archive.
*
* @return the zip archive handler or NULL on error
*/
extern ZIP_EXPORT struct zip_t *zip_open(const char *zipname, int level,
char mode);
/**
* Opens zip archive with compression level using the given mode.
* The function additionally returns @param errnum -
*
* @param zipname zip archive file name.
* @param level compression level (0-9 are the standard zlib-style levels).
* @param mode file access mode.
* - 'r': opens a file for reading/extracting (the file must exist).
* - 'w': creates an empty file for writing.
* - 'a': appends to an existing archive.
* @param errnum 0 on success, negative number (< 0) on error.
*
* @return the zip archive handler or NULL on error
*/
extern ZIP_EXPORT struct zip_t *
zip_openwitherror(const char *zipname, int level, char mode, int *errnum);
/**
* Closes the zip archive, releases resources - always finalize.
*
* @param zip zip archive handler.
*/
extern ZIP_EXPORT void zip_close(struct zip_t *zip);
/**
* Determines if the archive has a zip64 end of central directory headers.
*
* @param zip zip archive handler.
*
* @return the return code - 1 (true), 0 (false), negative number (< 0) on
* error.
*/
extern ZIP_EXPORT int zip_is64(struct zip_t *zip);
/**
* Opens an entry by name in the zip archive.
*
* For zip archive opened in 'w' or 'a' mode the function will append
* a new entry. In readonly mode the function tries to locate the entry
* in global dictionary.
*
* @param zip zip archive handler.
* @param entryname an entry name in local dictionary.
*
* @return the return code - 0 on success, negative number (< 0) on error.
*/
extern ZIP_EXPORT int zip_entry_open(struct zip_t *zip, const char *entryname);
/**
* Opens an entry by name in the zip archive.
*
* For zip archive opened in 'w' or 'a' mode the function will append
* a new entry. In readonly mode the function tries to locate the entry
* in global dictionary (case sensitive).
*
* @param zip zip archive handler.
* @param entryname an entry name in local dictionary (case sensitive).
*
* @return the return code - 0 on success, negative number (< 0) on error.
*/
extern ZIP_EXPORT int zip_entry_opencasesensitive(struct zip_t *zip,
const char *entryname);
/**
* Opens a new entry by index in the zip archive.
*
* This function is only valid if zip archive was opened in 'r' (readonly) mode.
*
* @param zip zip archive handler.
* @param index index in local dictionary.
*
* @return the return code - 0 on success, negative number (< 0) on error.
*/
extern ZIP_EXPORT int zip_entry_openbyindex(struct zip_t *zip, size_t index);
/**
* Closes a zip entry, flushes buffer and releases resources.
*
* @param zip zip archive handler.
*
* @return the return code - 0 on success, negative number (< 0) on error.
*/
extern ZIP_EXPORT int zip_entry_close(struct zip_t *zip);
/**
* Returns a local name of the current zip entry.
*
* The main difference between user's entry name and local entry name
* is optional relative path.
* Following .ZIP File Format Specification - the path stored MUST not contain
* a drive or device letter, or a leading slash.
* All slashes MUST be forward slashes '/' as opposed to backwards slashes '\'
* for compatibility with Amiga and UNIX file systems etc.
*
* @param zip zip archive handler.
*
* @return the pointer to the current zip entry name, or NULL on error.
*/
extern ZIP_EXPORT const char *zip_entry_name(struct zip_t *zip);
/**
* Returns an index of the current zip entry.
*
* @param zip zip archive handler.
*
* @return the index on success, negative number (< 0) on error.
*/
extern ZIP_EXPORT ssize_t zip_entry_index(struct zip_t *zip);
/**
* Determines if the current zip entry is a directory entry.
*
* @param zip zip archive handler.
*
* @return the return code - 1 (true), 0 (false), negative number (< 0) on
* error.
*/
extern ZIP_EXPORT int zip_entry_isdir(struct zip_t *zip);
/**
* Returns the uncompressed size of the current zip entry.
* Alias for zip_entry_uncomp_size (for backward compatibility).
*
* @param zip zip archive handler.
*
* @return the uncompressed size in bytes.
*/
extern ZIP_EXPORT unsigned long long zip_entry_size(struct zip_t *zip);
/**
* Returns the uncompressed size of the current zip entry.
*
* @param zip zip archive handler.
*
* @return the uncompressed size in bytes.
*/
extern ZIP_EXPORT unsigned long long zip_entry_uncomp_size(struct zip_t *zip);
/**
* Returns the compressed size of the current zip entry.
*
* @param zip zip archive handler.
*
* @return the compressed size in bytes.
*/
extern ZIP_EXPORT unsigned long long zip_entry_comp_size(struct zip_t *zip);
/**
* Returns CRC-32 checksum of the current zip entry.
*
* @param zip zip archive handler.
*
* @return the CRC-32 checksum.
*/
extern ZIP_EXPORT unsigned int zip_entry_crc32(struct zip_t *zip);
/**
* Compresses an input buffer for the current zip entry.
*
* @param zip zip archive handler.
* @param buf input buffer.
* @param bufsize input buffer size (in bytes).
*
* @return the return code - 0 on success, negative number (< 0) on error.
*/
extern ZIP_EXPORT int zip_entry_write(struct zip_t *zip, const void *buf,
size_t bufsize);
/**
* Compresses a file for the current zip entry.
*
* @param zip zip archive handler.
* @param filename input file.
*
* @return the return code - 0 on success, negative number (< 0) on error.
*/
extern ZIP_EXPORT int zip_entry_fwrite(struct zip_t *zip, const char *filename);
/**
* Extracts the current zip entry into output buffer.
*
* The function allocates sufficient memory for an output buffer.
*
* @param zip zip archive handler.
* @param buf output buffer.
* @param bufsize output buffer size (in bytes).
*
* @note remember to release memory allocated for an output buffer.
* For large entries, please take a look at the zip_entry_extract function.
*
* @return the return code - the number of bytes actually read on success.
* Otherwise a negative number (< 0) on error.
*/
extern ZIP_EXPORT ssize_t zip_entry_read(struct zip_t *zip, void **buf,
size_t *bufsize);
/**
* Extracts the current zip entry into a memory buffer using no memory
* allocation.
*
* @param zip zip archive handler.
* @param buf preallocated output buffer.
* @param bufsize output buffer size (in bytes).
*
* @note ensure supplied output buffer is large enough.
* zip_entry_size function (returns uncompressed size for the current
* entry) can be handy to estimate how big a buffer is needed.
* For large entries, please take a look at zip_entry_extract function.
*
* @return the return code - the number of bytes actually read on success.
* Otherwise a negative number (< 0) on error (e.g. bufsize is not large
* enough).
*/
extern ZIP_EXPORT ssize_t zip_entry_noallocread(struct zip_t *zip, void *buf,
size_t bufsize);
/**
* Extracts the current zip entry into output file.
*
* @param zip zip archive handler.
* @param filename output file.
*
* @return the return code - 0 on success, negative number (< 0) on error.
*/
extern ZIP_EXPORT int zip_entry_fread(struct zip_t *zip, const char *filename);
/**
* Extracts the current zip entry using a callback function (on_extract).
*
* @param zip zip archive handler.
* @param on_extract callback function.
* @param arg opaque pointer (optional argument, which you can pass to the
* on_extract callback)
*
* @return the return code - 0 on success, negative number (< 0) on error.
*/
extern ZIP_EXPORT int
zip_entry_extract(struct zip_t *zip,
size_t (*on_extract)(void *arg, uint64_t offset,
const void *data, size_t size),
void *arg);
/**
* Returns the number of all entries (files and directories) in the zip archive.
*
* @param zip zip archive handler.
*
* @return the return code - the number of entries on success, negative number
* (< 0) on error.
*/
extern ZIP_EXPORT ssize_t zip_entries_total(struct zip_t *zip);
/**
* Deletes zip archive entries.
*
* @param zip zip archive handler.
* @param entries array of zip archive entries to be deleted.
* @param len the number of entries to be deleted.
* @return the number of deleted entries, or negative number (< 0) on error.
*/
extern ZIP_EXPORT ssize_t zip_entries_delete(struct zip_t *zip,
char *const entries[], size_t len);
/**
* Extracts a zip archive stream into directory.
*
* If on_extract is not NULL, the callback will be called after
* successfully extracted each zip entry.
* Returning a negative value from the callback will cause abort and return an
* error. The last argument (void *arg) is optional, which you can use to pass
* data to the on_extract callback.
*
* @param stream zip archive stream.
* @param size stream size.
* @param dir output directory.
* @param on_extract on extract callback.
* @param arg opaque pointer.
*
* @return the return code - 0 on success, negative number (< 0) on error.
*/
extern ZIP_EXPORT int
zip_stream_extract(const char *stream, size_t size, const char *dir,
int (*on_extract)(const char *filename, void *arg),
void *arg);
/**
* Opens zip archive stream into memory.
*
* @param stream zip archive stream.
* @param size stream size.
* @param level compression level (0-9 are the standard zlib-style levels).
* @param mode file access mode.
* - 'r': opens a file for reading/extracting (the file must exist).
* - 'w': creates an empty file for writing.
* - 'a': appends to an existing archive.
*
* @return the zip archive handler or NULL on error
*/
extern ZIP_EXPORT struct zip_t *zip_stream_open(const char *stream, size_t size,
int level, char mode);
/**
* Opens zip archive stream into memory.
* The function additionally returns @param errnum -
*
* @param stream zip archive stream.
* @param size stream size.
* @param level compression level (0-9 are the standard zlib-style levels).
* @param mode file access mode.
* - 'r': opens a file for reading/extracting (the file must exist).
* - 'w': creates an empty file for writing.
* - 'a': appends to an existing archive.
* @param errnum 0 on success, negative number (< 0) on error.
*
* @return the zip archive handler or NULL on error
*/
extern ZIP_EXPORT struct zip_t *zip_stream_openwitherror(const char *stream,
size_t size, int level,
char mode,
int *errnum);
/**
* Copy zip archive stream output buffer.
*
* @param zip zip archive handler.
* @param buf output buffer. User should free buf.
* @param bufsize output buffer size (in bytes).
*
* @return copy size
*/
extern ZIP_EXPORT ssize_t zip_stream_copy(struct zip_t *zip, void **buf,
size_t *bufsize);
/**
* Closes the zip archive and releases resources.
*
* @param zip zip archive handler.
*
* @return
*/
extern ZIP_EXPORT void zip_stream_close(struct zip_t *zip);
/**
* Creates a new archive and puts files into a single zip archive.
*
* @param zipname zip archive file.
* @param filenames input files.
* @param len number of input files.
*
* @return the return code - 0 on success, negative number (< 0) on error.
*/
extern ZIP_EXPORT int zip_create(const char *zipname, const char *filenames[],
size_t len);
/**
* Extracts a zip archive file into directory.
*
* If on_extract_entry is not NULL, the callback will be called after
* successfully extracted each zip entry.
* Returning a negative value from the callback will cause abort and return an
* error. The last argument (void *arg) is optional, which you can use to pass
* data to the on_extract_entry callback.
*
* @param zipname zip archive file.
* @param dir output directory.
* @param on_extract_entry on extract callback.
* @param arg opaque pointer.
*
* @return the return code - 0 on success, negative number (< 0) on error.
*/
extern ZIP_EXPORT int zip_extract(const char *zipname, const char *dir,
int (*on_extract_entry)(const char *filename,
void *arg),
void *arg);
/** @} */
#ifdef __cplusplus
}
#endif
#endif

examples/CMakeLists.txt

@@ -1,8 +1,4 @@
# TODO: move into its own subdirectory
# TODO: make stb libs a target (maybe common)
set(SD_TARGET sd)
include_directories(${CMAKE_CURRENT_SOURCE_DIR})
add_executable(${SD_TARGET} main.cpp stb_image.h stb_image_write.h)
install(TARGETS ${SD_TARGET} RUNTIME)
target_link_libraries(${SD_TARGET} PRIVATE stable-diffusion ${CMAKE_THREAD_LIBS_INIT})
target_compile_features(${SD_TARGET} PUBLIC cxx_std_11)
add_subdirectory(cli)
add_subdirectory(convert)

examples/cli/CMakeLists.txt

@@ -0,0 +1,6 @@
set(TARGET sd)
add_executable(${TARGET} main.cpp)
install(TARGETS ${TARGET} RUNTIME)
target_link_libraries(${TARGET} PRIVATE stable-diffusion common ${CMAKE_THREAD_LIBS_INIT})
target_compile_features(${TARGET} PUBLIC cxx_std_11)

95
examples/cli/main.cpp Normal file

@@ -0,0 +1,95 @@
#include <stdio.h>
#include <ctime>
#include <random>
#include "common.h"
#include "stable-diffusion.h"
#define STB_IMAGE_IMPLEMENTATION
#include "stb_image.h"
#define STB_IMAGE_WRITE_IMPLEMENTATION
#define STB_IMAGE_WRITE_STATIC
#include "stb_image_write.h"
int main(int argc, const char* argv[]) {
SDParams params;
parse_args(argc, argv, params);
if (params.verbose) {
print_params(params);
printf("%s", sd_get_system_info().c_str());
set_sd_log_level(SDLogLevel::DEBUG);
}
bool vae_decode_only = true;
uint8_t* input_image_buffer = NULL;
if (params.mode == IMG2IMG) {
vae_decode_only = false;
int c = 0;
input_image_buffer = stbi_load(params.input_path.c_str(), &params.width, &params.height, &c, 3);
if (input_image_buffer == NULL) {
fprintf(stderr, "load image from '%s' failed\n", params.input_path.c_str());
return 1;
}
if (c != 3) {
fprintf(stderr, "input image must be a 3 channels RGB image, but got %d channels\n", c);
free(input_image_buffer);
return 1;
}
if (params.width <= 0 || params.width % 64 != 0) {
fprintf(stderr, "error: the width of image must be a multiple of 64\n");
free(input_image_buffer);
return 1;
}
if (params.height <= 0 || params.height % 64 != 0) {
fprintf(stderr, "error: the height of image must be a multiple of 64\n");
free(input_image_buffer);
return 1;
}
}
StableDiffusion sd(params.n_threads, vae_decode_only, true, params.lora_model_dir, params.rng_type);
if (!sd.load_from_file(params.model_path, params.schedule)) {
return 1;
}
std::vector<uint8_t*> results;
if (params.mode == TXT2IMG) {
results = sd.txt2img(params.prompt,
params.negative_prompt,
params.cfg_scale,
params.width,
params.height,
params.sample_method,
params.sample_steps,
params.seed,
params.batch_count);
} else {
results = sd.img2img(input_image_buffer,
params.prompt,
params.negative_prompt,
params.cfg_scale,
params.width,
params.height,
params.sample_method,
params.sample_steps,
params.strength,
params.seed);
}
if (results.size() == 0 || results.size() != params.batch_count) {
fprintf(stderr, "generate failed\n");
return 1;
}
size_t last = params.output_path.find_last_of(".");
std::string dummy_name = last != std::string::npos ? params.output_path.substr(0, last) : params.output_path;
for (int i = 0; i < params.batch_count; i++) {
std::string final_image_path = i > 0 ? dummy_name + "_" + std::to_string(i + 1) + ".png" : dummy_name + ".png";
stbi_write_png(final_image_path.c_str(), params.width, params.height, 3, results[i], 0, get_image_params(params, params.seed + i).c_str());
printf("save result image to '%s'\n", final_image_path.c_str());
}
return 0;
}

examples/convert/CMakeLists.txt

@@ -0,0 +1,5 @@
set(TARGET convert)
add_executable(${TARGET} convert.cpp vocab.hpp)
target_link_libraries(${TARGET} PRIVATE ggml zip ${CMAKE_THREAD_LIBS_INIT})
target_compile_features(${TARGET} PUBLIC cxx_std_11)

examples/convert/README.md

@@ -0,0 +1,16 @@
# Model Convert
## Usage
```
usage: convert.exe [MODEL_PATH] --type [OUT_TYPE] [arguments]
Models supported for conversion: .safetensors models or .ckpt checkpoint models
arguments:
-h, --help show this help message and exit
-o, --out [FILENAME] path or name of the converted model
--vocab [FILENAME] path to custom vocab.json (usually unnecessary)
-v, --verbose print processing info (dev info)
-l, --lora force read the model as a LoRA
--vae [FILENAME] merge a custom VAE
-t, --type [OUT_TYPE] output format (f32, f16, q4_0, q4_1, q5_0, q5_1, q8_0)
```
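For example, a hypothetical invocation that converts a checkpoint to f16 while merging a custom VAE into it (file names illustrative):

```
convert.exe sd-v1-4.ckpt -t f16 --vae vae.safetensors
```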

1565
examples/convert/convert.cpp Normal file

File diff suppressed because it is too large

80099
examples/convert/vocab.hpp Normal file

File diff suppressed because it is too large

2
ggml

@@ -1 +1 @@
Subproject commit 4efc7b208f9e48a61d03256ce0b1ee6c971fdbaf
Subproject commit 03669ba9fdc5e0520e919e5c7e1b3a3359d28e59

5
models/.gitignore vendored

@@ -1,5 +0,0 @@
*.bin
*.ckpt
*.safetensor
*.safetensors
*.log

models/README.md

@@ -1,26 +0,0 @@
# Model Convert Script
## Requirements
- vocab.json, from https://huggingface.co/openai/clip-vit-large-patch14/raw/main/vocab.json
```shell
pip install -r requirements.txt
```
## Usage
```
usage: convert.py [-h] [--out_type {f32,f16,q4_0,q4_1,q5_0,q5_1,q8_0}] [--out_file OUT_FILE] model_path
Convert Stable Diffusion model to GGML compatible file format
positional arguments:
model_path model file path (*.pth, *.pt, *.ckpt, *.safetensors)
options:
-h, --help show this help message and exit
--out_type {f32,f16,q4_0,q4_1,q5_0,q5_1,q8_0}
output format (default: based on input)
--out_file OUT_FILE path to write to; default: based on input and current working directory
```

models/convert.py

@@ -1,483 +0,0 @@
import struct
import json
import os
import numpy as np
import torch
import re
import safetensors.torch
this_file_dir = os.path.dirname(__file__)
vocab_dir = this_file_dir
SD1 = 0
SD2 = 1
ggml_ftype_str_to_int = {
"f32": 0,
"f16": 1,
"q4_0": 2,
"q4_1": 3,
"q5_0": 8,
"q5_1": 9,
"q8_0": 7
}
ggml_ttype_str_to_int = {
"f32": 0,
"f16": 1,
"q4_0": 2,
"q4_1": 3,
"q5_0": 6,
"q5_1": 7,
"q8_0": 8
}
QK4_0 = 32
def quantize_q4_0(x):
assert x.shape[-1] % QK4_0 == 0 and x.shape[-1] > QK4_0
x = x.reshape(-1, QK4_0)
max = np.take_along_axis(x, np.argmax(np.abs(x), axis=-1)[:, np.newaxis], axis=-1)
d = max / -8
qs = ((x / d) + 8).round().clip(min=0, max=15).astype(np.int8)
half = QK4_0 // 2
qs = qs[:, :half] | (qs[:, half:] << 4)
d = d.astype(np.float16).view(np.int8)
y = np.concatenate((d, qs), axis=-1)
return y
QK4_1 = 32
def quantize_q4_1(x):
assert x.shape[-1] % QK4_1 == 0 and x.shape[-1] > QK4_1
x = x.reshape(-1, QK4_1)
min = np.min(x, axis=-1, keepdims=True)
max = np.max(x, axis=-1, keepdims=True)
d = (max - min) / ((1 << 4) - 1)
qs = ((x - min) / d).round().clip(min=0, max=15).astype(np.int8)
half = QK4_1 // 2
qs = qs[:, :half] | (qs[:, half:] << 4)
d = d.astype(np.float16).view(np.int8)
m = min.astype(np.float16).view(np.int8)
y = np.concatenate((d, m, qs), axis=-1)
return y
QK5_0 = 32
def quantize_q5_0(x):
assert x.shape[-1] % QK5_0 == 0 and x.shape[-1] > QK5_0
x = x.reshape(-1, QK5_0)
max = np.take_along_axis(x, np.argmax(np.abs(x), axis=-1)[:, np.newaxis], axis=-1)
d = max / -16
xi = ((x / d) + 16).round().clip(min=0, max=31).astype(np.int8)
half = QK5_0 // 2
qs = (xi[:, :half] & 0x0F) | (xi[:, half:] << 4)
qh = np.zeros(qs.shape[:-1], dtype=np.int32)
for i in range(QK5_0):
qh |= ((xi[:, i] & 0x10) >> 4).astype(np.int32) << i
d = d.astype(np.float16).view(np.int8)
qh = qh[..., np.newaxis].view(np.int8)
y = np.concatenate((d, qh, qs), axis=-1)
return y
QK5_1 = 32
def quantize_q5_1(x):
assert x.shape[-1] % QK5_1 == 0 and x.shape[-1] > QK5_1
x = x.reshape(-1, QK5_1)
min = np.min(x, axis=-1, keepdims=True)
max = np.max(x, axis=-1, keepdims=True)
d = (max - min) / ((1 << 5) - 1)
xi = ((x - min) / d).round().clip(min=0, max=31).astype(np.int8)
half = QK5_1//2
qs = (xi[:, :half] & 0x0F) | (xi[:, half:] << 4)
qh = np.zeros(xi.shape[:-1], dtype=np.int32)
for i in range(QK5_1):
qh |= ((xi[:, i] & 0x10) >> 4).astype(np.int32) << i
d = d.astype(np.float16).view(np.int8)
m = min.astype(np.float16).view(np.int8)
qh = qh[..., np.newaxis].view(np.int8)
ndarray = np.concatenate((d, m, qh, qs), axis=-1)
return ndarray
QK8_0 = 32
def quantize_q8_0(x):
assert x.shape[-1] % QK8_0 == 0 and x.shape[-1] > QK8_0
x = x.reshape(-1, QK8_0)
amax = np.max(np.abs(x), axis=-1, keepdims=True)
d = amax / ((1 << 7) - 1)
qs = (x / d).round().clip(min=-128, max=127).astype(np.int8)
d = d.astype(np.float16).view(np.int8)
y = np.concatenate((d, qs), axis=-1)
return y
# copy from https://github.com/openai/CLIP/blob/main/clip/simple_tokenizer.py#L16
def bytes_to_unicode():
"""
Returns list of utf-8 byte and a corresponding list of unicode strings.
The reversible bpe codes work on unicode strings.
This means you need a large # of unicode characters in your vocab if you want to avoid UNKs.
When you're at something like a 10B token dataset you end up needing around 5K for decent coverage.
This is a significant percentage of your normal, say, 32K bpe vocab.
To avoid that, we want lookup tables between utf-8 bytes and unicode strings.
And avoids mapping to whitespace/control characters the bpe code barfs on.
"""
bs = list(range(ord("!"), ord("~")+1))+list(range(ord("¡"), ord("¬")+1))+list(range(ord("®"), ord("ÿ")+1))
cs = bs[:]
n = 0
for b in range(2**8):
if b not in bs:
bs.append(b)
cs.append(2**8+n)
n += 1
cs = [chr(n) for n in cs]
return dict(zip(bs, cs))
def load_model_from_file(model_path):
print("loading model from {}".format(model_path))
if model_path.lower().endswith(".safetensors"):
pl_sd = safetensors.torch.load_file(model_path, device="cpu")
else:
pl_sd = torch.load(model_path, map_location="cpu")
state_dict = pl_sd["state_dict"] if "state_dict" in pl_sd else pl_sd
print("loading model from {} completed".format(model_path))
return state_dict
def get_alpha_comprod(linear_start=0.00085, linear_end=0.0120, timesteps=1000):
betas = torch.linspace(linear_start ** 0.5, linear_end ** 0.5, timesteps, dtype=torch.float32) ** 2
alphas = 1. - betas
alphas_cumprod = np.cumprod(alphas.numpy(), axis=0)
return torch.tensor(alphas_cumprod)
unused_tensors = [
"betas",
"alphas_cumprod_prev",
"sqrt_alphas_cumprod",
"sqrt_one_minus_alphas_cumprod",
"log_one_minus_alphas_cumprod",
"sqrt_recip_alphas_cumprod",
"sqrt_recipm1_alphas_cumprod",
"posterior_variance",
"posterior_log_variance_clipped",
"posterior_mean_coef1",
"posterior_mean_coef2",
"cond_stage_model.transformer.text_model.embeddings.position_ids",
"cond_stage_model.model.logit_scale",
"cond_stage_model.model.text_projection",
"model_ema.decay",
"model_ema.num_updates",
"control_model",
"lora_te_text_model",
"embedding_manager"
]
def preprocess(state_dict):
alphas_cumprod = state_dict.get("alphas_cumprod")
if alphas_cumprod != None:
# print((np.abs(get_alpha_comprod().numpy() - alphas_cumprod.numpy()) < 0.000001).all())
pass
else:
print("no alphas_cumprod in file, generate new one")
alphas_cumprod = get_alpha_comprod()
state_dict["alphas_cumprod"] = alphas_cumprod
new_state_dict = {}
for name, w in state_dict.items():
# ignore unused tensors
if not isinstance(w, torch.Tensor):
continue
skip = False
for unused_tensor in unused_tensors:
if name.startswith(unused_tensor):
skip = True
break
if skip:
continue
# convert BF16 to FP16
if w.dtype == torch.bfloat16:
w = w.to(torch.float16)
# convert open_clip to hf CLIPTextModel (for SD2.x)
open_clip_to_hf_clip_model = {
"cond_stage_model.model.ln_final.bias": "cond_stage_model.transformer.text_model.final_layer_norm.bias",
"cond_stage_model.model.ln_final.weight": "cond_stage_model.transformer.text_model.final_layer_norm.weight",
"cond_stage_model.model.positional_embedding": "cond_stage_model.transformer.text_model.embeddings.position_embedding.weight",
"cond_stage_model.model.token_embedding.weight": "cond_stage_model.transformer.text_model.embeddings.token_embedding.weight",
"first_stage_model.decoder.mid.attn_1.to_k.bias": "first_stage_model.decoder.mid.attn_1.k.bias",
"first_stage_model.decoder.mid.attn_1.to_k.weight": "first_stage_model.decoder.mid.attn_1.k.weight",
"first_stage_model.decoder.mid.attn_1.to_out.0.bias": "first_stage_model.decoder.mid.attn_1.proj_out.bias",
"first_stage_model.decoder.mid.attn_1.to_out.0.weight": "first_stage_model.decoder.mid.attn_1.proj_out.weight",
"first_stage_model.decoder.mid.attn_1.to_q.bias": "first_stage_model.decoder.mid.attn_1.q.bias",
"first_stage_model.decoder.mid.attn_1.to_q.weight": "first_stage_model.decoder.mid.attn_1.q.weight",
"first_stage_model.decoder.mid.attn_1.to_v.bias": "first_stage_model.decoder.mid.attn_1.v.bias",
"first_stage_model.decoder.mid.attn_1.to_v.weight": "first_stage_model.decoder.mid.attn_1.v.weight",
}
open_clip_to_hk_clip_resblock = {
"attn.out_proj.bias": "self_attn.out_proj.bias",
"attn.out_proj.weight": "self_attn.out_proj.weight",
"ln_1.bias": "layer_norm1.bias",
"ln_1.weight": "layer_norm1.weight",
"ln_2.bias": "layer_norm2.bias",
"ln_2.weight": "layer_norm2.weight",
"mlp.c_fc.bias": "mlp.fc1.bias",
"mlp.c_fc.weight": "mlp.fc1.weight",
"mlp.c_proj.bias": "mlp.fc2.bias",
"mlp.c_proj.weight": "mlp.fc2.weight",
}
open_clip_resblock_prefix = "cond_stage_model.model.transformer.resblocks."
hf_clip_resblock_prefix = "cond_stage_model.transformer.text_model.encoder.layers."
if name in open_clip_to_hf_clip_model:
new_name = open_clip_to_hf_clip_model[name]
print(f"preprocess {name} => {new_name}")
name = new_name
if name.startswith(open_clip_resblock_prefix):
remain = name[len(open_clip_resblock_prefix):]
idx = remain.split(".")[0]
suffix = remain[len(idx)+1:]
if suffix == "attn.in_proj_weight":
w_q, w_k, w_v = w.chunk(3)
for new_suffix, new_w in zip(["self_attn.q_proj.weight", "self_attn.k_proj.weight", "self_attn.v_proj.weight"], [w_q, w_k, w_v]):
new_name = hf_clip_resblock_prefix + idx + "." + new_suffix
new_state_dict[new_name] = new_w
print(f"preprocess {name}{w.size()} => {new_name}{new_w.size()}")
elif suffix == "attn.in_proj_bias":
w_q, w_k, w_v = w.chunk(3)
for new_suffix, new_w in zip(["self_attn.q_proj.bias", "self_attn.k_proj.bias", "self_attn.v_proj.bias"], [w_q, w_k, w_v]):
new_name = hf_clip_resblock_prefix + idx + "." + new_suffix
new_state_dict[new_name] = new_w
print(f"preprocess {name}{w.size()} => {new_name}{new_w.size()}")
else:
new_suffix = open_clip_to_hk_clip_resblock[suffix]
new_name = hf_clip_resblock_prefix + idx + "." + new_suffix
new_state_dict[new_name] = w
print(f"preprocess {name} => {new_name}")
continue
# convert unet transformer linear to conv2d 1x1
if name.startswith("model.diffusion_model.") and (name.endswith("proj_in.weight") or name.endswith("proj_out.weight")):
if len(w.shape) == 2:
new_w = w.unsqueeze(2).unsqueeze(3)
new_state_dict[name] = new_w
print(f"preprocess {name} {w.size()} => {name} {new_w.size()}")
continue
# convert vae attn block linear to conv2d 1x1
if name.startswith("first_stage_model.") and "attn_1" in name:
if len(w.shape) == 2:
new_w = w.unsqueeze(2).unsqueeze(3)
new_state_dict[name] = new_w
print(f"preprocess {name} {w.size()} => {name} {new_w.size()}")
continue
new_state_dict[name] = w
return new_state_dict
re_digits = re.compile(r"\d+")
re_x_proj = re.compile(r"(.*)_([qkv]_proj)$")
re_compiled = {}
suffix_conversion = {
"attentions": {},
"resnets": {
"conv1": "in_layers_2",
"conv2": "out_layers_3",
"norm1": "in_layers_0",
"norm2": "out_layers_0",
"time_emb_proj": "emb_layers_1",
"conv_shortcut": "skip_connection",
}
}
def convert_diffusers_name_to_compvis(key):
def match(match_list, regex_text):
regex = re_compiled.get(regex_text)
if regex is None:
regex = re.compile(regex_text)
re_compiled[regex_text] = regex
r = re.match(regex, key)
if not r:
return False
match_list.clear()
match_list.extend([int(x) if re.match(re_digits, x) else x for x in r.groups()])
return True
m = []
if match(m, r"lora_unet_conv_in(.*)"):
return f'model_diffusion_model_input_blocks_0_0{m[0]}'
if match(m, r"lora_unet_conv_out(.*)"):
return f'model_diffusion_model_out_2{m[0]}'
if match(m, r"lora_unet_time_embedding_linear_(\d+)(.*)"):
return f"model_diffusion_model_time_embed_{m[0] * 2 - 2}{m[1]}"
if match(m, r"lora_unet_down_blocks_(\d+)_(attentions|resnets)_(\d+)_(.+)"):
suffix = suffix_conversion.get(m[1], {}).get(m[3], m[3])
return f"model_diffusion_model_input_blocks_{1 + m[0] * 3 + m[2]}_{1 if m[1] == 'attentions' else 0}_{suffix}"
if match(m, r"lora_unet_mid_block_(attentions|resnets)_(\d+)_(.+)"):
suffix = suffix_conversion.get(m[0], {}).get(m[2], m[2])
return f"model_diffusion_model_middle_block_{1 if m[0] == 'attentions' else m[1] * 2}_{suffix}"
if match(m, r"lora_unet_up_blocks_(\d+)_(attentions|resnets)_(\d+)_(.+)"):
suffix = suffix_conversion.get(m[1], {}).get(m[3], m[3])
return f"model_diffusion_model_output_blocks_{m[0] * 3 + m[2]}_{1 if m[1] == 'attentions' else 0}_{suffix}"
if match(m, r"lora_unet_down_blocks_(\d+)_downsamplers_0_conv"):
return f"model_diffusion_model_input_blocks_{3 + m[0] * 3}_0_op"
if match(m, r"lora_unet_up_blocks_(\d+)_upsamplers_0_conv"):
return f"model_diffusion_model_output_blocks_{2 + m[0] * 3}_{2 if m[0]>0 else 1}_conv"
if match(m, r"lora_te_text_model_encoder_layers_(\d+)_(.+)"):
return f"cond_stage_model_transformer_text_model_encoder_layers_{m[0]}_{m[1]}"
return None
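# Sanity-check sketch of the rules above (the key is illustrative, not taken
# from a real LoRA file): down_blocks_0 / attentions_0 lands in input_blocks_1,
# slot 1, and the trailing suffix passes through unchanged.
assert convert_diffusers_name_to_compvis(
    "lora_unet_down_blocks_0_attentions_0_proj_in"
) == "model_diffusion_model_input_blocks_1_1_proj_in"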
def preprocess_lora(state_dict):
new_state_dict = {}
for name, w in state_dict.items():
if not isinstance(w, torch.Tensor):
continue
# convert BF16 to FP16
if w.dtype == torch.bfloat16:
w = w.to(torch.float16)
name_without_network_parts, network_part = name.split(".", 1)
new_name_without_network_parts = convert_diffusers_name_to_compvis(name_without_network_parts)
        if new_name_without_network_parts is None:
            raise Exception(f"unknown lora tensor: {name}")
new_name = new_name_without_network_parts + "." + network_part
print(f"preprocess {name} => {new_name}")
new_state_dict[new_name] = w
return new_state_dict
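# Example of the split/rename above (illustrative key): everything after the
# first "." is the LoRA network part and is carried over unchanged, e.g.
#   "lora_te_text_model_encoder_layers_0_mlp_fc1.lora_up.weight"
#   -> "cond_stage_model_transformer_text_model_encoder_layers_0_mlp_fc1.lora_up.weight"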
def convert(model_path, out_type = None, out_file=None, lora=False):
# load model
if not lora:
with open(os.path.join(vocab_dir, "vocab.json"), encoding="utf-8") as f:
clip_vocab = json.load(f)
state_dict = load_model_from_file(model_path)
model_type = SD1 # lora only for SD1 now
if not lora and "cond_stage_model.model.token_embedding.weight" in state_dict.keys():
model_type = SD2
print("Stable diffuison 2.x")
else:
print("Stable diffuison 1.x")
if lora:
state_dict = preprocess_lora(state_dict)
else:
state_dict = preprocess(state_dict)
# output option
if lora:
out_type = "f16" # only f16 for now
    if out_type is None:
weight = state_dict["model.diffusion_model.input_blocks.0.0.weight"].numpy()
if weight.dtype == np.float32:
out_type = "f32"
elif weight.dtype == np.float16:
out_type = "f16"
elif weight.dtype == np.float64:
out_type = "f32"
else:
raise Exception("unsupported weight type %s" % weight.dtype)
    if out_file is None:
        if lora:
            out_file = os.path.splitext(os.path.basename(model_path))[0] + "-ggml-lora.bin"
else:
out_file = os.path.splitext(os.path.basename(model_path))[0] + f"-ggml-model-{out_type}.bin"
out_file = os.path.join(os.getcwd(), out_file)
print(f"Saving GGML compatible file to {out_file}")
# convert and save
with open(out_file, "wb") as file:
# magic: ggml in hex
file.write(struct.pack("i", 0x67676D6C))
# model & file type
ftype = (model_type << 16) | ggml_ftype_str_to_int[out_type]
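        # the model type goes in the high 16 bits and the ggml file type in the
        # low 16 bits, so a single int32 carries both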
file.write(struct.pack("i", ftype))
# vocab
if not lora:
byte_encoder = bytes_to_unicode()
byte_decoder = {v: k for k, v in byte_encoder.items()}
file.write(struct.pack("i", len(clip_vocab)))
for key in clip_vocab:
text = bytearray([byte_decoder[c] for c in key])
file.write(struct.pack("i", len(text)))
file.write(text)
# weights
for name in state_dict.keys():
if not isinstance(state_dict[name], torch.Tensor):
continue
skip = False
for unused_tensor in unused_tensors:
if name.startswith(unused_tensor):
skip = True
break
if skip:
continue
if name in unused_tensors:
continue
data = state_dict[name].numpy()
n_dims = len(data.shape)
shape = data.shape
old_type = data.dtype
ttype = "f32"
if n_dims == 4 and not lora:
data = data.astype(np.float16)
ttype = "f16"
elif n_dims == 2 and name[-7:] == ".weight":
if out_type == "f32":
data = data.astype(np.float32)
elif out_type == "f16":
data = data.astype(np.float16)
elif out_type == "q4_0":
data = quantize_q4_0(data)
elif out_type == "q4_1":
data = quantize_q4_1(data)
elif out_type == "q5_0":
data = quantize_q5_0(data)
elif out_type == "q5_1":
data = quantize_q5_1(data)
elif out_type == "q8_0":
data = quantize_q8_0(data)
else:
raise Exception("invalid out_type {}".format(out_type))
ttype = out_type
else:
data = data.astype(np.float32)
ttype = "f32"
print("Processing tensor: {} with shape {}, {} -> {}".format(name, data.shape, old_type, ttype))
# header
name_bytes = name.encode("utf-8")
file.write(struct.pack("iii", n_dims, len(name_bytes), ggml_ttype_str_to_int[ttype]))
for i in range(n_dims):
file.write(struct.pack("i", shape[n_dims - 1 - i]))
file.write(name_bytes)
# data
data.tofile(file)
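            # per-tensor record layout written above:
            #   int32 n_dims | int32 name_len | int32 ttype
            #   | n_dims * int32 dims (innermost first) | name bytes | raw tensor data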
print("Convert done")
print(f"Saved GGML compatible file to {out_file}")
if __name__ == "__main__":
import argparse
parser = argparse.ArgumentParser(description="Convert Stable Diffuison model to GGML compatible file format")
parser.add_argument("--out_type", choices=["f32", "f16", "q4_0", "q4_1", "q5_0", "q5_1", "q8_0"], help="output format (default: based on input)")
parser.add_argument("--out_file", help="path to write to; default: based on input and current working directory")
parser.add_argument("--lora", action='store_true', default = False, help="convert lora weight; default: false")
parser.add_argument("model_path", help="model file path (*.pth, *.pt, *.ckpt, *.safetensors)")
args = parser.parse_args()
convert(args.model_path, args.out_type, args.out_file, args.lora)
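    # Example invocation (script and file names are illustrative):
    #   python convert.py sd-v1-4.ckpt --out_type q4_0
    # would write sd-v1-4-ggml-model-q4_0.bin to the current working directory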

View File

@ -1,335 +0,0 @@
# Copy from https://github.com/huggingface/diffusers/blob/main/scripts/convert_diffusers_to_original_stable_diffusion.py
# LICENSE: https://github.com/huggingface/diffusers/blob/main/LICENSE
# Script for converting a HF Diffusers saved pipeline to a Stable Diffusion checkpoint.
# *Only* converts the UNet, VAE, and Text Encoder.
# Does not convert optimizer state or anything else.
import argparse
import os.path as osp
import re
import torch
from safetensors.torch import load_file, save_file
# =================#
# UNet Conversion #
# =================#
unet_conversion_map = [
# (stable-diffusion, HF Diffusers)
("time_embed.0.weight", "time_embedding.linear_1.weight"),
("time_embed.0.bias", "time_embedding.linear_1.bias"),
("time_embed.2.weight", "time_embedding.linear_2.weight"),
("time_embed.2.bias", "time_embedding.linear_2.bias"),
("input_blocks.0.0.weight", "conv_in.weight"),
("input_blocks.0.0.bias", "conv_in.bias"),
("out.0.weight", "conv_norm_out.weight"),
("out.0.bias", "conv_norm_out.bias"),
("out.2.weight", "conv_out.weight"),
("out.2.bias", "conv_out.bias"),
]
unet_conversion_map_resnet = [
# (stable-diffusion, HF Diffusers)
("in_layers.0", "norm1"),
("in_layers.2", "conv1"),
("out_layers.0", "norm2"),
("out_layers.3", "conv2"),
("emb_layers.1", "time_emb_proj"),
("skip_connection", "conv_shortcut"),
]
unet_conversion_map_layer = []
# hardcoded number of downblocks and resnets/attentions...
# would need smarter logic for other networks.
for i in range(4):
# loop over downblocks/upblocks
for j in range(2):
# loop over resnets/attentions for downblocks
hf_down_res_prefix = f"down_blocks.{i}.resnets.{j}."
sd_down_res_prefix = f"input_blocks.{3*i + j + 1}.0."
unet_conversion_map_layer.append((sd_down_res_prefix, hf_down_res_prefix))
if i < 3:
# no attention layers in down_blocks.3
hf_down_atn_prefix = f"down_blocks.{i}.attentions.{j}."
sd_down_atn_prefix = f"input_blocks.{3*i + j + 1}.1."
unet_conversion_map_layer.append((sd_down_atn_prefix, hf_down_atn_prefix))
for j in range(3):
# loop over resnets/attentions for upblocks
hf_up_res_prefix = f"up_blocks.{i}.resnets.{j}."
sd_up_res_prefix = f"output_blocks.{3*i + j}.0."
unet_conversion_map_layer.append((sd_up_res_prefix, hf_up_res_prefix))
if i > 0:
# no attention layers in up_blocks.0
hf_up_atn_prefix = f"up_blocks.{i}.attentions.{j}."
sd_up_atn_prefix = f"output_blocks.{3*i + j}.1."
unet_conversion_map_layer.append((sd_up_atn_prefix, hf_up_atn_prefix))
if i < 3:
# no downsample in down_blocks.3
hf_downsample_prefix = f"down_blocks.{i}.downsamplers.0.conv."
sd_downsample_prefix = f"input_blocks.{3*(i+1)}.0.op."
unet_conversion_map_layer.append((sd_downsample_prefix, hf_downsample_prefix))
# no upsample in up_blocks.3
hf_upsample_prefix = f"up_blocks.{i}.upsamplers.0."
sd_upsample_prefix = f"output_blocks.{3*i + 2}.{1 if i == 0 else 2}."
unet_conversion_map_layer.append((sd_upsample_prefix, hf_upsample_prefix))
hf_mid_atn_prefix = "mid_block.attentions.0."
sd_mid_atn_prefix = "middle_block.1."
unet_conversion_map_layer.append((sd_mid_atn_prefix, hf_mid_atn_prefix))
for j in range(2):
hf_mid_res_prefix = f"mid_block.resnets.{j}."
sd_mid_res_prefix = f"middle_block.{2*j}."
unet_conversion_map_layer.append((sd_mid_res_prefix, hf_mid_res_prefix))
def convert_unet_state_dict(unet_state_dict):
# buyer beware: this is a *brittle* function,
# and correct output requires that all of these pieces interact in
# the exact order in which I have arranged them.
mapping = {k: k for k in unet_state_dict.keys()}
for sd_name, hf_name in unet_conversion_map:
mapping[hf_name] = sd_name
for k, v in mapping.items():
if "resnets" in k:
for sd_part, hf_part in unet_conversion_map_resnet:
v = v.replace(hf_part, sd_part)
mapping[k] = v
for k, v in mapping.items():
for sd_part, hf_part in unet_conversion_map_layer:
v = v.replace(hf_part, sd_part)
mapping[k] = v
new_state_dict = {v: unet_state_dict[k] for k, v in mapping.items()}
return new_state_dict
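# Worked sketch of the two passes above (illustrative key): the resnet map
# rewrites the layer-local suffix first, then the layer map rewrites the block
# prefix, e.g.
#   "down_blocks.0.resnets.0.norm1.weight"
#     -> "down_blocks.0.resnets.0.in_layers.0.weight"   (resnet pass)
#     -> "input_blocks.1.0.in_layers.0.weight"          (layer pass)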
# ================#
# VAE Conversion #
# ================#
vae_conversion_map = [
# (stable-diffusion, HF Diffusers)
("nin_shortcut", "conv_shortcut"),
("norm_out", "conv_norm_out"),
("mid.attn_1.", "mid_block.attentions.0."),
]
for i in range(4):
# down_blocks have two resnets
for j in range(2):
hf_down_prefix = f"encoder.down_blocks.{i}.resnets.{j}."
sd_down_prefix = f"encoder.down.{i}.block.{j}."
vae_conversion_map.append((sd_down_prefix, hf_down_prefix))
if i < 3:
hf_downsample_prefix = f"down_blocks.{i}.downsamplers.0."
sd_downsample_prefix = f"down.{i}.downsample."
vae_conversion_map.append((sd_downsample_prefix, hf_downsample_prefix))
hf_upsample_prefix = f"up_blocks.{i}.upsamplers.0."
sd_upsample_prefix = f"up.{3-i}.upsample."
vae_conversion_map.append((sd_upsample_prefix, hf_upsample_prefix))
# up_blocks have three resnets
# also, up blocks in hf are numbered in reverse from sd
for j in range(3):
hf_up_prefix = f"decoder.up_blocks.{i}.resnets.{j}."
sd_up_prefix = f"decoder.up.{3-i}.block.{j}."
vae_conversion_map.append((sd_up_prefix, hf_up_prefix))
# this part accounts for mid blocks in both the encoder and the decoder
for i in range(2):
hf_mid_res_prefix = f"mid_block.resnets.{i}."
sd_mid_res_prefix = f"mid.block_{i+1}."
vae_conversion_map.append((sd_mid_res_prefix, hf_mid_res_prefix))
vae_conversion_map_attn = [
# (stable-diffusion, HF Diffusers)
("norm.", "group_norm."),
("q.", "query."),
("k.", "key."),
("v.", "value."),
("proj_out.", "proj_attn."),
]
def reshape_weight_for_sd(w):
# convert HF linear weights to SD conv2d weights
return w.reshape(*w.shape, 1, 1)
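# e.g. a (512, 512) HF linear weight becomes a (512, 512, 1, 1) tensor, matching
# the 1x1 conv2d kernels the original SD VAE uses for its attention projections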
def convert_vae_state_dict(vae_state_dict):
mapping = {k: k for k in vae_state_dict.keys()}
for k, v in mapping.items():
for sd_part, hf_part in vae_conversion_map:
v = v.replace(hf_part, sd_part)
mapping[k] = v
for k, v in mapping.items():
if "attentions" in k:
for sd_part, hf_part in vae_conversion_map_attn:
v = v.replace(hf_part, sd_part)
mapping[k] = v
new_state_dict = {v: vae_state_dict[k] for k, v in mapping.items()}
weights_to_convert = ["q", "k", "v", "proj_out"]
for k, v in new_state_dict.items():
for weight_name in weights_to_convert:
if f"mid.attn_1.{weight_name}.weight" in k:
print(f"Reshaping {k} for SD format")
new_state_dict[k] = reshape_weight_for_sd(v)
return new_state_dict
# =========================#
# Text Encoder Conversion #
# =========================#
textenc_conversion_lst = [
# (stable-diffusion, HF Diffusers)
("resblocks.", "text_model.encoder.layers."),
("ln_1", "layer_norm1"),
("ln_2", "layer_norm2"),
(".c_fc.", ".fc1."),
(".c_proj.", ".fc2."),
(".attn", ".self_attn"),
("ln_final.", "transformer.text_model.final_layer_norm."),
("token_embedding.weight", "transformer.text_model.embeddings.token_embedding.weight"),
("positional_embedding", "transformer.text_model.embeddings.position_embedding.weight"),
]
protected = {re.escape(x[1]): x[0] for x in textenc_conversion_lst}
textenc_pattern = re.compile("|".join(protected.keys()))
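# textenc_pattern rewrites every protected HF fragment back to its OpenCLIP
# form in one pass, e.g.
#   "text_model.encoder.layers.0.layer_norm1.weight" -> "resblocks.0.ln_1.weight"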
# Ordering is from https://github.com/pytorch/pytorch/blob/master/test/cpp/api/modules.cpp
code2idx = {"q": 0, "k": 1, "v": 2}
def convert_text_enc_state_dict_v20(text_enc_dict):
new_state_dict = {}
capture_qkv_weight = {}
capture_qkv_bias = {}
for k, v in text_enc_dict.items():
if (
k.endswith(".self_attn.q_proj.weight")
or k.endswith(".self_attn.k_proj.weight")
or k.endswith(".self_attn.v_proj.weight")
):
k_pre = k[: -len(".q_proj.weight")]
k_code = k[-len("q_proj.weight")]
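            # k[-13] (= -len("q_proj.weight")) is the single character "q", "k"
            # or "v"; the bias branch below uses the same trick with -11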
if k_pre not in capture_qkv_weight:
capture_qkv_weight[k_pre] = [None, None, None]
capture_qkv_weight[k_pre][code2idx[k_code]] = v
continue
if (
k.endswith(".self_attn.q_proj.bias")
or k.endswith(".self_attn.k_proj.bias")
or k.endswith(".self_attn.v_proj.bias")
):
k_pre = k[: -len(".q_proj.bias")]
k_code = k[-len("q_proj.bias")]
if k_pre not in capture_qkv_bias:
capture_qkv_bias[k_pre] = [None, None, None]
capture_qkv_bias[k_pre][code2idx[k_code]] = v
continue
relabelled_key = textenc_pattern.sub(lambda m: protected[re.escape(m.group(0))], k)
new_state_dict[relabelled_key] = v
for k_pre, tensors in capture_qkv_weight.items():
if None in tensors:
raise Exception("CORRUPTED MODEL: one of the q-k-v values for the text encoder was missing")
relabelled_key = textenc_pattern.sub(lambda m: protected[re.escape(m.group(0))], k_pre)
new_state_dict[relabelled_key + ".in_proj_weight"] = torch.cat(tensors)
for k_pre, tensors in capture_qkv_bias.items():
if None in tensors:
raise Exception("CORRUPTED MODEL: one of the q-k-v values for the text encoder was missing")
relabelled_key = textenc_pattern.sub(lambda m: protected[re.escape(m.group(0))], k_pre)
new_state_dict[relabelled_key + ".in_proj_bias"] = torch.cat(tensors)
return new_state_dict
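# Net effect: the three separate (hidden, hidden) HF q/k/v projections (and
# their biases) are re-fused, in q, k, v order, into the single in_proj_weight
# and in_proj_bias tensors the OpenCLIP checkpoint layout expects.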
def convert_text_enc_state_dict(text_enc_dict):
return text_enc_dict
if __name__ == "__main__":
parser = argparse.ArgumentParser()
parser.add_argument("--model_path", default=None, type=str, required=True, help="Path to the model to convert.")
parser.add_argument("--checkpoint_path", default=None, type=str, required=True, help="Path to the output model.")
parser.add_argument("--half", action="store_true", help="Save weights in half precision.")
parser.add_argument(
"--use_safetensors", action="store_true", help="Save weights use safetensors, default is ckpt."
)
args = parser.parse_args()
assert args.model_path is not None, "Must provide a model path!"
assert args.checkpoint_path is not None, "Must provide a checkpoint path!"
# Path for safetensors
unet_path = osp.join(args.model_path, "unet", "diffusion_pytorch_model.safetensors")
vae_path = osp.join(args.model_path, "vae", "diffusion_pytorch_model.safetensors")
text_enc_path = osp.join(args.model_path, "text_encoder", "model.safetensors")
# Load weights from safetensors files if they exist; otherwise fall back to PyTorch .bin files
if osp.exists(unet_path):
unet_state_dict = load_file(unet_path, device="cpu")
else:
unet_path = osp.join(args.model_path, "unet", "diffusion_pytorch_model.bin")
unet_state_dict = torch.load(unet_path, map_location="cpu")
if osp.exists(vae_path):
vae_state_dict = load_file(vae_path, device="cpu")
else:
vae_path = osp.join(args.model_path, "vae", "diffusion_pytorch_model.bin")
vae_state_dict = torch.load(vae_path, map_location="cpu")
if osp.exists(text_enc_path):
text_enc_dict = load_file(text_enc_path, device="cpu")
else:
text_enc_path = osp.join(args.model_path, "text_encoder", "pytorch_model.bin")
text_enc_dict = torch.load(text_enc_path, map_location="cpu")
# Convert the UNet model
unet_state_dict = convert_unet_state_dict(unet_state_dict)
unet_state_dict = {"model.diffusion_model." + k: v for k, v in unet_state_dict.items()}
# Convert the VAE model
vae_state_dict = convert_vae_state_dict(vae_state_dict)
vae_state_dict = {"first_stage_model." + k: v for k, v in vae_state_dict.items()}
# Easiest way to identify v2.0 model seems to be that the text encoder (OpenCLIP) is deeper
is_v20_model = "text_model.encoder.layers.22.layer_norm2.bias" in text_enc_dict
if is_v20_model:
        # Prepend "transformer." so the pattern can rewrite the final layer norm key to "ln_final."
text_enc_dict = {"transformer." + k: v for k, v in text_enc_dict.items()}
text_enc_dict = convert_text_enc_state_dict_v20(text_enc_dict)
text_enc_dict = {"cond_stage_model.model." + k: v for k, v in text_enc_dict.items()}
else:
text_enc_dict = convert_text_enc_state_dict(text_enc_dict)
text_enc_dict = {"cond_stage_model.transformer." + k: v for k, v in text_enc_dict.items()}
# Put together new checkpoint
state_dict = {**unet_state_dict, **vae_state_dict, **text_enc_dict}
if args.half:
state_dict = {k: v.half() for k, v in state_dict.items()}
if args.use_safetensors:
save_file(state_dict, args.checkpoint_path)
else:
state_dict = {"state_dict": state_dict}
torch.save(state_dict, args.checkpoint_path)

View File

@ -1,4 +0,0 @@
numpy
torch
safetensors
pytorch_lightning

File diff suppressed because one or more lines are too long

File diff suppressed because it is too large

View File

@ -48,7 +48,7 @@ public:
std::string lora_model_dir = "",
RNGType rng_type = STD_DEFAULT_RNG);
bool load_from_file(const std::string& file_path, Schedule d = DEFAULT);
std::vector<uint8_t> txt2img(
std::vector<uint8_t*> txt2img(
std::string prompt,
std::string negative_prompt,
float cfg_scale,
@ -56,9 +56,11 @@ public:
int height,
SampleMethod sample_method,
int sample_steps,
int64_t seed);
std::vector<uint8_t> img2img(
const std::vector<uint8_t>& init_img,
int64_t seed,
int batch_count);
std::vector<uint8_t*> img2img(
const uint8_t* init_img_data,
std::string prompt,
std::string negative_prompt,
float cfg_scale,