feat: add convert api (#142)

Authored by leejet on 2024-01-14 11:43:24 +08:00, committed by GitHub
parent 2b6ec97fe2
commit 5c614e4bc2
5 changed files with 167 additions and 25 deletions

README.md

@ -126,7 +126,7 @@ cmake .. -DSD_METAL=ON
cmake --build . --config Release
```
### Using Flash Attention
##### Using Flash Attention
Enabling flash attention reduces memory usage by at least 400 MB. At the moment, it is not supported when CUBLAS is enabled because the kernel implementation is missing.
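For reference, a build with flash attention enabled would look roughly like the sketch below; the `SD_FLASH_ATTN` option name is assumed here, mirroring the `SD_METAL` option shown above.

```sh
# hedged sketch: assumes the flash-attention CMake option is named SD_FLASH_ATTN
cmake .. -DSD_FLASH_ATTN=ON
cmake --build . --config Release
```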
@ -142,7 +142,7 @@ usage: ./bin/sd [arguments]
arguments:
-h, --help show this help message and exit
-M, --mode [txt2img or img2img] generation mode (default: txt2img)
-M, --mode [MODE] run mode (txt2img or img2img or convert, default: txt2img)
-t, --threads N number of threads to use during computation (default: -1).
If threads <= 0, then threads will be set to the number of CPU physical cores
-m, --model [MODEL] path to model
@ -168,7 +168,8 @@ arguments:
-s SEED, --seed SEED RNG seed (default: 42, use random seed for < 0)
-b, --batch-count COUNT number of images to generate.
--schedule {discrete, karras} Denoiser sigma schedule (default: discrete)
--clip-skip N number of layers to skip of clip model (default: 0)
--clip-skip N ignore last layers of CLIP network; 1 ignores none, 2 ignores one layer (default: -1)
<= 0 represents unspecified, will be 1 for SD1.x, 2 for SD2.x
--vae-tiling process vae in tiles to reduce memory usage
-v, --verbose print extra info
```
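As a concrete example of the reworked `--clip-skip` option above, passing `2` explicitly ignores the last CLIP layer (the SD2.x default), using the same model and prompt as the examples below:

```sh
./bin/sd -m ../models/v1-5-pruned-emaonly.safetensors -p "a lovely cat" --clip-skip 2
```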
@ -183,6 +184,16 @@ You can specify the model weight type using the `--type` parameter. The weights
- `q5_0` or `q5_1` for 5-bit integer quantization
- `q4_0` or `q4_1` for 4-bit integer quantization
#### Convert to GGUF
You can also convert weights in the formats `ckpt/safetensors/diffusers` to gguf and perform quantization in advance, avoiding the need for quantization every time you load them.
For example:
```sh
./bin/sd -M convert -m ../models/v1-5-pruned-emaonly.safetensors -o ../models/v1-5-pruned-emaonly.q8_0.gguf -v --type q8_0
```
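The resulting GGUF file can then be passed directly as the model for generation, so the weights are already quantized when loaded; a minimal sketch reusing the path from the conversion above:

```sh
./bin/sd -m ../models/v1-5-pruned-emaonly.q8_0.gguf -p "a lovely cat"
```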
#### txt2img example
```sh
@ -240,7 +251,7 @@ Here's a simple example:
| ---- |---- |
| ![](./assets/without_lcm.png) |![](./assets/with_lcm.png) |
## Using TAESD for faster decoding
#### Using TAESD for faster decoding
You can use TAESD to accelerate the decoding of latent images by following these steps:
@ -258,7 +269,7 @@ curl -L -O https://huggingface.co/madebyollin/taesd/blob/main/diffusion_pytorch_
sd -m ../models/v1-5-pruned-emaonly.safetensors -p "a lovely cat" --taesd ../models/diffusion_pytorch_model.safetensors
```
## Using ESRGAN to upscale results
#### Using ESRGAN to upscale results
You can use ESRGAN to upscale the generated images. At the moment, only the [RealESRGAN_x4plus_anime_6B.pth](https://github.com/xinntao/Real-ESRGAN/releases/download/v0.2.2.4/RealESRGAN_x4plus_anime_6B.pth) model is supported. Support for more models of this architecture will be added soon.
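For illustration, upscaling is combined with a normal generation run. The sketch below assumes the ESRGAN weights are passed via an `--upscale-model` option (that flag does not appear in this excerpt), pointing at the model file downloaded from the link above:

```sh
./bin/sd -m ../models/v1-5-pruned-emaonly.safetensors -p "a lovely cat" --upscale-model ../models/RealESRGAN_x4plus_anime_6B.pth
```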

main.cpp

@ -42,11 +42,13 @@ const char* schedule_str[] = {
const char* modes_str[] = {
"txt2img",
"img2img",
"convert",
};
enum SDMode {
TXT2IMG,
IMG2IMG,
CONVERT,
MODE_COUNT
};
@ -125,7 +127,7 @@ void print_usage(int argc, const char* argv[]) {
printf("\n");
printf("arguments:\n");
printf(" -h, --help show this help message and exit\n");
printf(" -M, --mode [txt2img or img2img] generation mode (default: txt2img)\n");
printf(" -M, --mode [MODEL] run mode (txt2img or img2img or convert, default: txt2img)\n");
printf(" -t, --threads N number of threads to use during computation (default: -1).\n");
printf(" If threads <= 0, then threads will be set to the number of CPU physical cores\n");
printf(" -m, --model [MODEL] path to model\n");
@ -384,7 +386,7 @@ void parse_args(int argc, const char** argv, SDParams& params) {
params.n_threads = get_num_physical_cores();
}
if (params.prompt.length() == 0) {
if (params.mode != CONVERT && params.prompt.length() == 0) {
fprintf(stderr, "error: the following arguments are required: prompt\n");
print_usage(argc, argv);
exit(1);
@ -432,6 +434,12 @@ void parse_args(int argc, const char** argv, SDParams& params) {
srand((int)time(NULL));
params.seed = rand();
}
if (params.mode == CONVERT) {
if (params.output_path == "output.png") {
params.output_path = "output.gguf";
}
}
}
std::string get_image_params(SDParams params, int64_t seed) {
@ -479,6 +487,24 @@ int main(int argc, const char* argv[]) {
printf("%s", sd_get_system_info());
}
if (params.mode == CONVERT) {
bool success = convert(params.model_path.c_str(), params.vae_path.c_str(), params.output_path.c_str(), params.wtype);
if (!success) {
fprintf(stderr,
"convert '%s'/'%s' to '%s' failed\n",
params.model_path.c_str(),
params.vae_path.c_str(),
params.output_path.c_str());
return 1;
} else {
printf("convert '%s'/'%s' to '%s' success\n",
params.model_path.c_str(),
params.vae_path.c_str(),
params.output_path.c_str());
return 0;
}
}
bool vae_decode_only = true;
uint8_t* input_image_buffer = NULL;
if (params.mode == IMG2IMG) {

model.cpp

@ -15,6 +15,8 @@
#include "ggml/ggml-backend.h"
#include "ggml/ggml.h"
#include "stable-diffusion.h"
#ifdef SD_USE_METAL
#include "ggml-metal.h"
#endif
@ -609,7 +611,7 @@ bool is_safetensors_file(const std::string& file_path) {
}
size_t header_size_ = read_u64(header_size_buf);
if (header_size_ >= file_size_) {
if (header_size_ >= file_size_ || header_size_ <= 2) {
return false;
}
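Context for the tightened check: a safetensors file begins with an 8-byte little-endian length of its JSON header, and the smallest syntactically valid header (`{}`) is 2 bytes and describes no tensors, so lengths of 2 or less, or lengths that do not fit in the file, can be rejected up front. A self-contained sketch of the same sanity check, independent of the surrounding helpers:

```cpp
#include <cstdint>
#include <cstring>

// Hedged sketch of the safetensors header sanity check; assumes a little-endian
// host, matching how the 8-byte length prefix is interpreted above.
bool plausible_safetensors_header(const uint8_t length_prefix[8], uint64_t file_size) {
    uint64_t header_size = 0;
    std::memcpy(&header_size, length_prefix, sizeof(header_size));
    // The header must fit inside the file and must be large enough to hold a
    // JSON object that actually describes tensors ("{}" alone is only 2 bytes).
    return header_size < file_size && header_size > 2;
}
```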
@ -1181,6 +1183,9 @@ SDVersion ModelLoader::get_sd_version() {
if (tensor_storage.name.find("conditioner.embedders.1") != std::string::npos) {
return VERSION_XL;
}
if (tensor_storage.name.find("cond_stage_model.1") != std::string::npos) {
return VERSION_XL;
}
if (tensor_storage.name == "cond_stage_model.transformer.text_model.embeddings.token_embedding.weight" ||
tensor_storage.name == "cond_stage_model.model.token_embedding.weight" ||
tensor_storage.name == "text_model.embeddings.token_embedding.weight" ||
@ -1218,7 +1223,35 @@ std::string ModelLoader::load_merges() {
return merges_utf8_str;
}
void remove_duplicates(std::vector<TensorStorage>& vec) {
std::unordered_map<std::string, size_t> name_to_index_map;
for (size_t i = 0; i < vec.size(); ++i) {
const std::string& current_name = vec[i].name;
auto it = name_to_index_map.find(current_name);
if (it != name_to_index_map.end()) {
vec[it->second] = vec[i];
} else {
name_to_index_map[current_name] = i;
}
}
vec.resize(name_to_index_map.size());
}
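`remove_duplicates` keeps one entry per tensor name: the slot of the first occurrence is retained, but its contents are overwritten by the last duplicate seen. A tiny, self-contained illustration of that behaviour; the `Item` struct is hypothetical, standing in for `TensorStorage`:

```cpp
#include <cstdio>
#include <string>
#include <unordered_map>
#include <vector>

struct Item {               // hypothetical stand-in for TensorStorage
    std::string name;
    int payload;
};

// Same idea as remove_duplicates above: last duplicate wins, first slot is kept.
static void dedup(std::vector<Item>& vec) {
    std::unordered_map<std::string, size_t> name_to_index;
    for (size_t i = 0; i < vec.size(); ++i) {
        auto it = name_to_index.find(vec[i].name);
        if (it != name_to_index.end()) {
            vec[it->second] = vec[i];   // overwrite the earlier entry in place
        } else {
            name_to_index[vec[i].name] = i;
        }
    }
    vec.resize(name_to_index.size());
}

int main() {
    std::vector<Item> items = {{"alpha", 1}, {"beta", 2}, {"alpha", 3}};
    dedup(items);
    for (const auto& it : items) {
        std::printf("%s=%d\n", it.name.c_str(), it.payload);  // alpha=3, beta=2
    }
    return 0;
}
```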
bool ModelLoader::load_tensors(on_new_tensor_cb_t on_new_tensor_cb, ggml_backend_t backend) {
std::vector<TensorStorage> processed_tensor_storages;
for (auto& tensor_storage : tensor_storages) {
// LOG_DEBUG("%s", name.c_str());
if (is_unused_tensor(tensor_storage.name)) {
continue;
}
preprocess_tensor(tensor_storage, processed_tensor_storages);
}
remove_duplicates(processed_tensor_storages);
bool success = true;
for (size_t file_index = 0; file_index < file_paths_.size(); file_index++) {
std::string file_path = file_paths_[file_index];
@ -1276,22 +1309,10 @@ bool ModelLoader::load_tensors(on_new_tensor_cb_t on_new_tensor_cb, ggml_backend
return true;
};
std::vector<TensorStorage> processed_tensor_storages;
for (auto& tensor_storage : tensor_storages) {
for (auto& tensor_storage : processed_tensor_storages) {
if (tensor_storage.file_index != file_index) {
continue;
}
// LOG_DEBUG("%s", name.c_str());
if (is_unused_tensor(tensor_storage.name)) {
continue;
}
preprocess_tensor(tensor_storage, processed_tensor_storages);
}
for (auto& tensor_storage : processed_tensor_storages) {
// LOG_DEBUG("%s", tensor_storage.name.c_str());
ggml_tensor* dst_tensor = NULL;
@ -1437,7 +1458,61 @@ bool ModelLoader::load_tensors(std::map<std::string, struct ggml_tensor*>& tenso
return true;
}
int64_t ModelLoader::cal_mem_size(ggml_backend_t backend) {
bool ModelLoader::save_to_gguf_file(const std::string& file_path, ggml_type type) {
auto backend = ggml_backend_cpu_init();
size_t mem_size = 1 * 1024 * 1024; // for padding
mem_size += tensor_storages.size() * ggml_tensor_overhead();
mem_size += cal_mem_size(backend, type);
LOG_INFO("model tensors mem size: %.2fMB", mem_size / 1024.f / 1024.f);
ggml_context* ggml_ctx = ggml_init({mem_size, NULL, false});
gguf_context* gguf_ctx = gguf_init_empty();
auto on_new_tensor_cb = [&](const TensorStorage& tensor_storage, ggml_tensor** dst_tensor) -> bool {
const std::string& name = tensor_storage.name;
ggml_type tensor_type = tensor_storage.type;
if (type != GGML_TYPE_COUNT) {
if (ggml_is_quantized(type) && tensor_storage.ne[0] % 32 != 0) {
tensor_type = GGML_TYPE_F16;
} else {
tensor_type = type;
}
}
ggml_tensor* tensor = ggml_new_tensor(ggml_ctx, tensor_type, tensor_storage.n_dims, tensor_storage.ne);
if (tensor == NULL) {
LOG_ERROR("ggml_new_tensor failed");
return false;
}
ggml_set_name(tensor, name.c_str());
// LOG_DEBUG("%s %d %s %d[%d %d %d %d] %d[%d %d %d %d]", name.c_str(),
// ggml_nbytes(tensor), ggml_type_name(tensor_type),
// tensor_storage.n_dims,
// tensor_storage.ne[0], tensor_storage.ne[1], tensor_storage.ne[2], tensor_storage.ne[3],
// tensor->n_dims, tensor->ne[0], tensor->ne[1], tensor->ne[2], tensor->ne[3]);
*dst_tensor = tensor;
gguf_add_tensor(gguf_ctx, tensor);
return true;
};
bool success = load_tensors(on_new_tensor_cb, backend);
ggml_backend_free(backend);
LOG_INFO("load tensors done");
LOG_INFO("trying to save tensors to %s", file_path.c_str());
if (success) {
gguf_write_to_file(gguf_ctx, file_path.c_str(), false);
}
ggml_free(ggml_ctx);
gguf_free(gguf_ctx);
return success;
}
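The callback above decides the on-disk type per tensor: when a target type is requested, a block-quantized format is only used for tensors whose first dimension is a multiple of 32, and such tensors otherwise fall back to F16; `cal_mem_size` below applies the same rule when estimating the buffer size. A minimal restatement of that rule as a standalone helper (hypothetical, not part of the codebase):

```cpp
#include "ggml/ggml.h"

// Hypothetical helper mirroring the per-tensor type selection used above.
static ggml_type pick_output_type(ggml_type requested, ggml_type original, int64_t ne0) {
    if (requested == GGML_TYPE_COUNT) {
        return original;       // no target type requested: keep the stored type
    }
    if (ggml_is_quantized(requested) && ne0 % 32 != 0) {
        return GGML_TYPE_F16;  // row length not a multiple of the 32-element block
    }
    return requested;
}
```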
int64_t ModelLoader::cal_mem_size(ggml_backend_t backend, ggml_type type) {
size_t alignment = 128;
if (backend != NULL) {
alignment = ggml_backend_get_alignment(backend);
@ -1452,8 +1527,35 @@ int64_t ModelLoader::cal_mem_size(ggml_backend_t backend) {
}
for (auto& tensor_storage : processed_tensor_storages) {
ggml_type tensor_type = tensor_storage.type;
if (type != GGML_TYPE_COUNT) {
if (ggml_is_quantized(type) && tensor_storage.ne[0] % 32 != 0) {
tensor_type = GGML_TYPE_F16;
} else {
tensor_type = type;
}
}
tensor_storage.type = tensor_type;
mem_size += tensor_storage.nbytes() + alignment;
}
return mem_size;
}
bool convert(const char* input_path, const char* vae_path, const char* output_path, sd_type_t output_type) {
ModelLoader model_loader;
if (!model_loader.init_from_file(input_path)) {
LOG_ERROR("init model loader from file failed: '%s'", input_path);
return false;
}
if (vae_path != NULL && strlen(vae_path) > 0) {
if (!model_loader.init_from_file(vae_path, "vae.")) {
LOG_ERROR("init model loader from file failed: '%s'", vae_path);
return false;
}
}
bool success = model_loader.save_to_gguf_file(output_path, (ggml_type)output_type);
return success;
}
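Because `convert` also accepts an optional VAE path and loads those tensors under a `vae.` prefix, a separate VAE can be folded into the converted file. From the CLI this would look roughly like the sketch below; the `--vae` flag name and the output filename are assumptions, as neither appears in this excerpt:

```sh
./bin/sd -M convert -m ../models/v1-5-pruned-emaonly.safetensors --vae ../models/custom_vae.safetensors -o ../models/v1-5-with-vae.q8_0.gguf --type q8_0
```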

model.h

@ -4,9 +4,9 @@
#include <functional>
#include <map>
#include <memory>
#include <set>
#include <string>
#include <vector>
#include <set>
#include "ggml/ggml-backend.h"
#include "ggml/ggml.h"
@ -121,7 +121,8 @@ public:
bool load_tensors(std::map<std::string, struct ggml_tensor*>& tensors,
ggml_backend_t backend,
std::set<std::string> ignore_tensors = {});
int64_t cal_mem_size(ggml_backend_t backend);
bool save_to_gguf_file(const std::string& file_path, ggml_type type);
int64_t cal_mem_size(ggml_backend_t backend, ggml_type type = GGML_TYPE_COUNT);
~ModelLoader() = default;
};
#endif // __MODEL_H__

stable-diffusion.h

@ -148,7 +148,9 @@ SD_API upscaler_ctx_t* new_upscaler_ctx(const char* esrgan_path,
enum sd_type_t wtype);
SD_API void free_upscaler_ctx(upscaler_ctx_t* upscaler_ctx);
SD_API sd_image_t upscale(upscaler_ctx_t*, sd_image_t input_image, uint32_t upscale_factor);
SD_API sd_image_t upscale(upscaler_ctx_t* upscaler_ctx, sd_image_t input_image, uint32_t upscale_factor);
SD_API bool convert(const char* input_path, const char* vae_path, const char* output_path, sd_type_t output_type);
#ifdef __cplusplus
}
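Finally, the new `convert` entry point can also be called directly through the public header. A minimal, hedged sketch follows; the `SD_TYPE_Q8_0` enumerator is assumed to exist in `sd_type_t` (mirroring the `q8_0` weight type mentioned in the README section above), and the file paths are placeholders:

```cpp
#include <cstdio>

#include "stable-diffusion.h"

int main() {
    // Paths are placeholders; SD_TYPE_Q8_0 is assumed to be one of the sd_type_t values.
    const char* input  = "models/v1-5-pruned-emaonly.safetensors";
    const char* vae    = "";  // empty string: no separate VAE to merge
    const char* output = "models/v1-5-pruned-emaonly.q8_0.gguf";

    if (!convert(input, vae, output, SD_TYPE_Q8_0)) {
        std::fprintf(stderr, "conversion failed\n");
        return 1;
    }
    std::printf("wrote %s\n", output);
    return 0;
}
```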