diff --git a/.gitignore b/.gitignore
index 2823bc9..acc7731 100644
--- a/.gitignore
+++ b/.gitignore
@@ -8,5 +8,6 @@ test/
*.bin
*.exe
*.gguf
+*.log
output.png
-models/*
\ No newline at end of file
+models/
\ No newline at end of file
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 7148431..b119ee6 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -25,7 +25,7 @@ endif()
#option(SD_BUILD_TESTS "sd: build tests" ${SD_STANDALONE})
option(SD_BUILD_EXAMPLES "sd: build examples" ${SD_STANDALONE})
option(SD_CUBLAS "sd: cuda backend" OFF)
-option(SD_FLASH_ATTN "sd: use flash attention for x4 less memory usage" OFF)
+option(SD_FLASH_ATTN "sd: use flash attention for x4 less memory usage" OFF)
option(BUILD_SHARED_LIBS "sd: build shared libs" OFF)
#option(SD_BUILD_SERVER "sd: build server example" ON)
@@ -45,14 +45,15 @@ set(CMAKE_POLICY_DEFAULT_CMP0077 NEW)
# deps
add_subdirectory(ggml)
+add_subdirectory(thirdparty)
+
set(SD_LIB stable-diffusion)
-add_library(${SD_LIB} stable-diffusion.h stable-diffusion.cpp)
-target_link_libraries(${SD_LIB} PUBLIC ggml)
-target_include_directories(${SD_LIB} PUBLIC .)
+add_library(${SD_LIB} stable-diffusion.h stable-diffusion.cpp model.h model.cpp util.h util.cpp)
+target_link_libraries(${SD_LIB} PUBLIC ggml zip)
+target_include_directories(${SD_LIB} PUBLIC . thirdparty)
target_compile_features(${SD_LIB} PUBLIC cxx_std_11)
-add_subdirectory(common)
if (SD_BUILD_EXAMPLES)
add_subdirectory(examples)
diff --git a/README.md b/README.md
index 05966c1..79c27bc 100644
--- a/README.md
+++ b/README.md
@@ -10,13 +10,15 @@ Inference of [Stable Diffusion](https://github.com/CompVis/stable-diffusion) in
- Plain C/C++ implementation based on [ggml](https://github.com/ggerganov/ggml), working in the same way as [llama.cpp](https://github.com/ggerganov/llama.cpp)
- Super lightweight and without external dependencies.
+- SD1.x and SD2.x support
- 16-bit, 32-bit float support
- 4-bit, 5-bit and 8-bit integer quantization support
- Accelerated memory-efficient CPU inference
- Only requires ~2.3GB when using txt2img with fp16 precision to generate a 512x512 image, and enabling Flash Attention lowers that to ~1.8GB.
- AVX, AVX2 and AVX512 support for x86 architectures
-- SD1.x and SD2.x support
- Full CUDA backend for GPU acceleration, for now just for float16 and float32 models. There are still some issues with quantized models on CUDA; these will be fixed in the future.
+- Can load ckpt, safetensors and diffusers models/checkpoints. Standalone VAE models are also supported.
+ - No need to convert to `.ggml` or `.gguf` anymore!
- Flash Attention for memory usage optimization (only cpu for now).
- Original `txt2img` and `img2img` mode
- Negative prompt
@@ -68,7 +70,7 @@ git submodule init
git submodule update
```
-### Convert weights
+### Download weights
- download original weights(.ckpt or .safetensors). For example
- Stable Diffusion v1.4 from https://huggingface.co/CompVis/stable-diffusion-v-1-4-original
@@ -81,22 +83,6 @@ git submodule update
# curl -L -O https://huggingface.co/stabilityai/stable-diffusion-2-1/blob/main/v2-1_768-nonema-pruned.safetensors
```
-- convert weights to gguf model format
-
- ```shell
- ./bin/convert sd-v1-4.ckpt -t f16
- ```
-
-### Quantization
-
-You can specify the output model format using the `--type` or `-t` parameter
-
-- `f16` for 16-bit floating-point
-- `f32` for 32-bit floating-point
-- `q8_0` for 8-bit integer quantization
-- `q5_0` or `q5_1` for 5-bit integer quantization
-- `q4_0` or `q4_1` for 4-bit integer quantization
-
### Build
#### Build from scratch
@@ -144,9 +130,11 @@ arguments:
-t, --threads N number of threads to use during computation (default: -1).
If threads <= 0, then threads will be set to the number of CPU physical cores
-m, --model [MODEL] path to model
- --lora-model-dir [DIR] lora model directory
+ --vae [VAE] path to vae
+ --type [TYPE] weight type (f32, f16, q4_0, q4_1, q5_0, q5_1, q8_0)
+                                     If not specified, the default is the type of the weight file.
+  --lora-model-dir [DIR]             lora model directory
-i, --init-img [IMAGE] path to the input image, required by img2img
- -o, --output OUTPUT path to write result image to (default: .\output.png)
+ -o, --output OUTPUT path to write result image to (default: ./output.png)
-p, --prompt [PROMPT] the prompt to render
-n, --negative-prompt PROMPT the negative prompt (default: "")
--cfg-scale SCALE unconditional guidance scale: (default: 7.0)
@@ -164,10 +152,21 @@ arguments:
-v, --verbose print extra info
```
+#### Quantization
+
+You can specify the model weight type using the `--type` parameter. The weights are automatically converted when loading the model.
+
+- `f16` for 16-bit floating-point
+- `f32` for 32-bit floating-point
+- `q8_0` for 8-bit integer quantization
+- `q5_0` or `q5_1` for 5-bit integer quantization
+- `q4_0` or `q4_1` for 4-bit integer quantization
+
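+For example, to load a checkpoint and quantize the weights to 8-bit on the fly (the model path is just an illustration):
+
+```sh
+./bin/sd -m ../models/sd-v1-4.ckpt -p "a lovely cat" --type q8_0
+```
+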
#### txt2img example
-```
-./bin/sd -m ../sd-v1-4-f16.gguf -p "a lovely cat"
+```sh
+./bin/sd -m ../models/sd-v1-4.ckpt -p "a lovely cat"
+# ./bin/sd -m ../models/v1-5-pruned-emaonly.safetensors -p "a lovely cat"
```
Using formats of different precisions will yield results of varying quality.
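+
+A standalone VAE can be used together with any of these models via the `--vae` flag (the VAE file name below is only an illustrative placeholder):
+
+```sh
+./bin/sd -m ../models/sd-v1-4.ckpt --vae ../models/custom-vae.safetensors -p "a lovely cat"
+```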
@@ -182,7 +181,7 @@ Using formats of different precisions will yield results of varying quality.
```
-./bin/sd --mode img2img -m ../models/sd-v1-4-f16.gguf -p "cat with blue eyes" -i ./output.png -o ./img2img_output.png --strength 0.4
+./bin/sd --mode img2img -m ../models/sd-v1-4.ckpt -p "cat with blue eyes" -i ./output.png -o ./img2img_output.png --strength 0.4
```
@@ -191,13 +190,6 @@ Using formats of different precisions will yield results of varying quality.
#### with LoRA
-- convert lora weights to gguf model format
-
- ```shell
- bin/convert [lora path] -t f16
- # For example, bin/convert marblesh.safetensors -t f16
- ```
-
- You can specify the directory where the lora weights are stored via `--lora-model-dir`. If not specified, the default is the current working directory.
- LoRA is specified via prompt, just like [stable-diffusion-webui](https://github.com/AUTOMATIC1111/stable-diffusion-webui/wiki/Features#lora).
@@ -205,10 +197,10 @@ Using formats of different precisions will yield results of varying quality.
Here's a simple example:
```
-./bin/sd -m ../models/v1-5-pruned-emaonly-f16.gguf -p "a lovely cat" --lora-model-dir ../models
+./bin/sd -m ../models/v1-5-pruned-emaonly.safetensors -p "a lovely cat" --lora-model-dir ../models
```
-`../models/marblesh.gguf` will be applied to the model
+`../models/marblesh.safetensors` or `../models/marblesh.ckpt` will be applied to the model
#### LCM/LCM-LoRA
@@ -219,7 +211,7 @@ Here's a simple example:
Here's a simple example:
```
-./bin/sd -m ../models/v1-5-pruned-emaonly-f16.gguf -p "a lovely cat" --steps 4 --lora-model-dir ../models -v --cfg-scale 1
+./bin/sd -m ../models/v1-5-pruned-emaonly.safetensors -p "a lovely cat" --steps 4 --lora-model-dir ../models -v --cfg-scale 1
```
| without LCM-LoRA (--cfg-scale 7) | with LCM-LoRA (--cfg-scale 1) |
@@ -240,14 +232,13 @@ docker build -t sd .
```shell
docker run -v /path/to/models:/models -v /path/to/output/:/output sd [args...]
# For example
-# docker run -v ./models:/models -v ./build:/output sd -m /models/sd-v1-4-f16.gguf -p "a lovely cat" -v -o /output/output.png
+# docker run -v ./models:/models -v ./build:/output sd -m /models/sd-v1-4.ckpt -p "a lovely cat" -v -o /output/output.png
```
-## Memory/Disk Requirements
+## Memory Requirements
| precision | f32 | f16 |q8_0 |q5_0 |q5_1 |q4_0 |q4_1 |
| ---- | ---- |---- |---- |---- |---- |---- |---- |
-| **Disk** | 2.7G | 2.0G | 1.7G | 1.6G | 1.6G | 1.5G | 1.5G |
| **Memory** (txt2img - 512 x 512) | ~2.8G | ~2.3G | ~2.1G | ~2.0G | ~2.0G | ~2.0G | ~2.0G |
| **Memory** (txt2img - 512 x 512) *with Flash Attention* | ~2.4G | ~1.9G | ~1.6G | ~1.5G | ~1.5G | ~1.5G | ~1.5G |
diff --git a/common/CMakeLists.txt b/common/CMakeLists.txt
deleted file mode 100644
index 715e3b5..0000000
--- a/common/CMakeLists.txt
+++ /dev/null
@@ -1,15 +0,0 @@
-set(TARGET common)
-
-# json.hpp library from: https://github.com/nlohmann/json
-
-add_library(${TARGET} OBJECT common.cpp common.h stb_image.h stb_image_write.h json.hpp)
-
-target_include_directories(${TARGET} PUBLIC .)
-target_link_libraries(${TARGET} PRIVATE stable-diffusion ${CMAKE_THREAD_LIBS_INIT})
-target_compile_features(${TARGET} PUBLIC cxx_std_11)
-
-# ZIP Library from: https://github.com/kuba--/zip
-
-set(Z_TARGET zip)
-add_library(${Z_TARGET} OBJECT zip.c zip.h miniz.h)
-target_include_directories(${Z_TARGET} PUBLIC .)
\ No newline at end of file
diff --git a/common/common.cpp b/common/common.cpp
deleted file mode 100644
index 3128017..0000000
--- a/common/common.cpp
+++ /dev/null
@@ -1,391 +0,0 @@
-#include "common.h"
-#include
-#include
-#include
-#include
-#include
-#include
-#include
-
-#if defined(__APPLE__) && defined(__MACH__)
-#include
-#include
-#endif
-
-#if !defined(_WIN32)
-#include
-#include
-#endif
-
-// get_num_physical_cores is copy from
-// https://github.com/ggerganov/llama.cpp/blob/master/examples/common.cpp
-// LICENSE: https://github.com/ggerganov/llama.cpp/blob/master/LICENSE
-int32_t get_num_physical_cores() {
-#ifdef __linux__
- // enumerate the set of thread siblings, num entries is num cores
-    std::unordered_set<std::string> siblings;
- for (uint32_t cpu = 0; cpu < UINT32_MAX; ++cpu) {
- std::ifstream thread_siblings("/sys/devices/system/cpu" + std::to_string(cpu) + "/topology/thread_siblings");
- if (!thread_siblings.is_open()) {
- break; // no more cpus
- }
- std::string line;
- if (std::getline(thread_siblings, line)) {
- siblings.insert(line);
- }
- }
- if (siblings.size() > 0) {
-        return static_cast<int32_t>(siblings.size());
- }
-#elif defined(__APPLE__) && defined(__MACH__)
- int32_t num_physical_cores;
- size_t len = sizeof(num_physical_cores);
- int result = sysctlbyname("hw.perflevel0.physicalcpu", &num_physical_cores, &len, NULL, 0);
- if (result == 0) {
- return num_physical_cores;
- }
- result = sysctlbyname("hw.physicalcpu", &num_physical_cores, &len, NULL, 0);
- if (result == 0) {
- return num_physical_cores;
- }
-#elif defined(_WIN32)
- // TODO: Implement
-#endif
- unsigned int n_threads = std::thread::hardware_concurrency();
- return n_threads > 0 ? (n_threads <= 4 ? n_threads : n_threads / 2) : 4;
-}
-
-const char* rng_type_to_str[] = {
- "std_default",
- "cuda",
-};
-
-// Names of the sampler method, same order as enum sample_method in stable-diffusion.h
-const char* sample_method_str[] = {
- "euler_a",
- "euler",
- "heun",
- "dpm2",
- "dpm++2s_a",
- "dpm++2m",
- "dpm++2mv2",
- "lcm",
-};
-
-// Names of the sigma schedule overrides, same order as sample_schedule in stable-diffusion.h
-const char* schedule_str[] = {
- "default",
- "discrete",
- "karras"};
-
-const char* modes_str[] = {
- "txt2img",
- "img2img"};
-
-void print_params(SDParams params) {
- printf("Option: \n");
- printf(" n_threads: %d\n", params.n_threads);
- printf(" mode: %s\n", modes_str[params.mode]);
- printf(" model_path: %s\n", params.model_path.c_str());
- printf(" output_path: %s\n", params.output_path.c_str());
- printf(" init_img: %s\n", params.input_path.c_str());
- printf(" prompt: %s\n", params.prompt.c_str());
- printf(" negative_prompt: %s\n", params.negative_prompt.c_str());
- printf(" cfg_scale: %.2f\n", params.cfg_scale);
- printf(" width: %d\n", params.width);
- printf(" height: %d\n", params.height);
- printf(" sample_method: %s\n", sample_method_str[params.sample_method]);
- printf(" schedule: %s\n", schedule_str[params.schedule]);
- printf(" sample_steps: %d\n", params.sample_steps);
- printf(" strength: %.2f\n", params.strength);
- printf(" rng: %s\n", rng_type_to_str[params.rng_type]);
- printf(" seed: %ld\n", params.seed);
- printf(" batch_count: %d\n", params.batch_count);
-}
-
-void print_usage(int argc, const char* argv[]) {
- printf("usage: %s [arguments]\n", argv[0]);
- printf("\n");
- printf("arguments:\n");
- printf(" -h, --help show this help message and exit\n");
- printf(" -M, --mode [txt2img or img2img] generation mode (default: txt2img)\n");
- printf(" -t, --threads N number of threads to use during computation (default: -1).\n");
- printf(" If threads <= 0, then threads will be set to the number of CPU physical cores\n");
- printf(" -m, --model [MODEL] path to model\n");
- printf(" --lora-model-dir [DIR] lora model directory\n");
- printf(" -i, --init-img [IMAGE] path to the input image, required by img2img\n");
- printf(" -o, --output OUTPUT path to write result image to (default: ./output.png)\n");
- printf(" -p, --prompt [PROMPT] the prompt to render\n");
- printf(" -n, --negative-prompt PROMPT the negative prompt (default: \"\")\n");
- printf(" --cfg-scale SCALE unconditional guidance scale: (default: 7.0)\n");
- printf(" --strength STRENGTH strength for noising/unnoising (default: 0.75)\n");
- printf(" 1.0 corresponds to full destruction of information in init image\n");
- printf(" -H, --height H image height, in pixel space (default: 512)\n");
- printf(" -W, --width W image width, in pixel space (default: 512)\n");
- printf(" --sampling-method {euler, euler_a, heun, dpm2, dpm++2s_a, dpm++2m, dpm++2mv2, lcm}\n");
- printf(" sampling method (default: \"euler_a\")\n");
- printf(" --steps STEPS number of sample steps (default: 20)\n");
- printf(" --rng {std_default, cuda} RNG (default: cuda)\n");
- printf(" -s SEED, --seed SEED RNG seed (default: 42, use random seed for < 0)\n");
- printf(" -b, --batch-count COUNT number of images to generate.\n");
- printf(" --schedule {discrete, karras} Denoiser sigma schedule (default: discrete)\n");
- printf(" -v, --verbose print extra info\n");
-}
-
-void parse_args(int argc, const char** argv, SDParams& params) {
- bool invalid_arg = false;
- std::string arg;
- for (int i = 1; i < argc; i++) {
- arg = argv[i];
-
- if (arg == "-t" || arg == "--threads") {
- if (++i >= argc) {
- invalid_arg = true;
- break;
- }
- params.n_threads = std::stoi(argv[i]);
- } else if (arg == "-M" || arg == "--mode") {
- if (++i >= argc) {
- invalid_arg = true;
- break;
- }
- const char* mode_selected = argv[i];
- int mode_found = -1;
- for (int d = 0; d < MODE_COUNT; d++) {
- if (!strcmp(mode_selected, modes_str[d])) {
- mode_found = d;
- }
- }
- if (mode_found == -1) {
- fprintf(stderr, "error: invalid mode %s, must be one of [txt2img, img2img]\n",
- mode_selected);
- exit(1);
- }
- params.mode = (sd_mode)mode_found;
- } else if (arg == "-m" || arg == "--model") {
- if (++i >= argc) {
- invalid_arg = true;
- break;
- }
- params.model_path = argv[i];
- } else if (arg == "--lora-model-dir") {
- if (++i >= argc) {
- invalid_arg = true;
- break;
- }
- params.lora_model_dir = argv[i];
- } else if (arg == "-i" || arg == "--init-img") {
- if (++i >= argc) {
- invalid_arg = true;
- break;
- }
- params.input_path = argv[i];
- } else if (arg == "-o" || arg == "--output") {
- if (++i >= argc) {
- invalid_arg = true;
- break;
- }
- params.output_path = argv[i];
- } else if (arg == "-p" || arg == "--prompt") {
- if (++i >= argc) {
- invalid_arg = true;
- break;
- }
- params.prompt = argv[i];
- } else if (arg == "-n" || arg == "--negative-prompt") {
- if (++i >= argc) {
- invalid_arg = true;
- break;
- }
- params.negative_prompt = argv[i];
- } else if (arg == "--cfg-scale") {
- if (++i >= argc) {
- invalid_arg = true;
- break;
- }
- params.cfg_scale = std::stof(argv[i]);
- } else if (arg == "--strength") {
- if (++i >= argc) {
- invalid_arg = true;
- break;
- }
- params.strength = std::stof(argv[i]);
- } else if (arg == "-H" || arg == "--height") {
- if (++i >= argc) {
- invalid_arg = true;
- break;
- }
- params.height = std::stoi(argv[i]);
- } else if (arg == "-W" || arg == "--width") {
- if (++i >= argc) {
- invalid_arg = true;
- break;
- }
- params.width = std::stoi(argv[i]);
- } else if (arg == "--steps") {
- if (++i >= argc) {
- invalid_arg = true;
- break;
- }
- params.sample_steps = std::stoi(argv[i]);
- } else if (arg == "-b" || arg == "--batch-count") {
- if (++i >= argc) {
- invalid_arg = true;
- break;
- }
- params.batch_count = std::stoi(argv[i]);
- } else if (arg == "--rng") {
- if (++i >= argc) {
- invalid_arg = true;
- break;
- }
- std::string rng_type_str = argv[i];
- if (rng_type_str == "std_default") {
- params.rng_type = STD_DEFAULT_RNG;
- } else if (rng_type_str == "cuda") {
- params.rng_type = CUDA_RNG;
- } else {
- invalid_arg = true;
- break;
- }
- } else if (arg == "--schedule") {
- if (++i >= argc) {
- invalid_arg = true;
- break;
- }
- const char* schedule_selected = argv[i];
- int schedule_found = -1;
- for (int d = 0; d < N_SCHEDULES; d++) {
- if (!strcmp(schedule_selected, schedule_str[d])) {
- schedule_found = d;
- }
- }
- if (schedule_found == -1) {
- invalid_arg = true;
- break;
- }
- params.schedule = (Schedule)schedule_found;
- } else if (arg == "-s" || arg == "--seed") {
- if (++i >= argc) {
- invalid_arg = true;
- break;
- }
- params.seed = std::stoll(argv[i]);
- } else if (arg == "--sampling-method") {
- if (++i >= argc) {
- invalid_arg = true;
- break;
- }
- const char* sample_method_selected = argv[i];
- int sample_method_found = -1;
- for (int m = 0; m < N_SAMPLE_METHODS; m++) {
- if (!strcmp(sample_method_selected, sample_method_str[m])) {
- sample_method_found = m;
- }
- }
- if (sample_method_found == -1) {
- invalid_arg = true;
- break;
- }
- params.sample_method = (SampleMethod)sample_method_found;
- } else if (arg == "-h" || arg == "--help") {
- print_usage(argc, argv);
- exit(0);
- } else if (arg == "-v" || arg == "--verbose") {
- params.verbose = true;
- } else {
- fprintf(stderr, "error: unknown argument: %s\n", arg.c_str());
- print_usage(argc, argv);
- exit(1);
- }
- }
- if (invalid_arg) {
- fprintf(stderr, "error: invalid parameter for argument: %s\n", arg.c_str());
- print_usage(argc, argv);
- exit(1);
- }
- if (params.n_threads <= 0) {
- params.n_threads = get_num_physical_cores();
- }
-
- if (params.prompt.length() == 0) {
- fprintf(stderr, "error: the following arguments are required: prompt\n");
- print_usage(argc, argv);
- exit(1);
- }
-
- if (params.model_path.length() == 0) {
- fprintf(stderr, "error: the following arguments are required: model_path\n");
- print_usage(argc, argv);
- exit(1);
- }
-
- if (params.mode == IMG2IMG && params.input_path.length() == 0) {
- fprintf(stderr, "error: when using the img2img mode, the following arguments are required: init-img\n");
- print_usage(argc, argv);
- exit(1);
- }
-
- if (params.output_path.length() == 0) {
- fprintf(stderr, "error: the following arguments are required: output_path\n");
- print_usage(argc, argv);
- exit(1);
- }
-
- if (params.width <= 0 || params.width % 64 != 0) {
- fprintf(stderr, "error: the width must be a multiple of 64\n");
- exit(1);
- }
-
- if (params.height <= 0 || params.height % 64 != 0) {
- fprintf(stderr, "error: the height must be a multiple of 64\n");
- exit(1);
- }
-
- if (params.sample_steps <= 0) {
- fprintf(stderr, "error: the sample_steps must be greater than 0\n");
- exit(1);
- }
-
- if (params.strength < 0.f || params.strength > 1.f) {
- fprintf(stderr, "error: can only work with strength in [0.0, 1.0]\n");
- exit(1);
- }
-
- if (params.seed < 0) {
- srand((int)time(NULL));
- params.seed = rand();
- }
-}
-
-std::string basename(const std::string& path) {
- size_t pos = path.find_last_of('/');
- if (pos != std::string::npos) {
- return path.substr(pos + 1);
- }
- pos = path.find_last_of('\\');
- if (pos != std::string::npos) {
- return path.substr(pos + 1);
- }
- return path;
-}
-
-std::string get_image_params(SDParams params, int seed) {
- std::string parameter_string = params.prompt + "\n";
- if (params.negative_prompt.size() != 0) {
- parameter_string += "Negative prompt: " + params.negative_prompt + "\n";
- }
- parameter_string += "Steps: " + std::to_string(params.sample_steps) + ", ";
- parameter_string += "CFG scale: " + std::to_string(params.cfg_scale) + ", ";
- parameter_string += "Seed: " + std::to_string(seed) + ", ";
- parameter_string += "Size: " + std::to_string(params.width) + "x" + std::to_string(params.height) + ", ";
- parameter_string += "Model: " + basename(params.model_path) + ", ";
- parameter_string += "RNG: " + std::string(rng_type_to_str[params.rng_type]) + ", ";
- parameter_string += "Sampler: " + std::string(sample_method_str[params.sample_method]);
- if (params.schedule == KARRAS) {
- parameter_string += " karras";
- }
- parameter_string += ", ";
- parameter_string += "Version: stable-diffusion.cpp";
- return parameter_string;
-}
diff --git a/common/common.h b/common/common.h
deleted file mode 100644
index abcf1e2..0000000
--- a/common/common.h
+++ /dev/null
@@ -1,43 +0,0 @@
-#pragma once
-
-#include
-#include "stable-diffusion.h"
-
-enum sd_mode {
- TXT2IMG,
- IMG2IMG,
- MODE_COUNT
-};
-
-struct SDParams {
- int n_threads = -1;
- sd_mode mode = TXT2IMG;
-
- std::string model_path;
- std::string lora_model_dir;
- std::string output_path = "output.png";
- std::string input_path;
-
- std::string prompt;
- std::string negative_prompt;
- float cfg_scale = 7.0f;
- int width = 512;
- int height = 512;
- int batch_count = 1;
-
- SampleMethod sample_method = EULER_A;
- Schedule schedule = DEFAULT;
- int sample_steps = 20;
- float strength = 0.75f;
- RNGType rng_type = CUDA_RNG;
- int64_t seed = 42;
- bool verbose = false;
-};
-
-void print_params(SDParams params);
-
-void print_usage(int argc, const char* argv[]);
-
-void parse_args(int argc, const char** argv, SDParams& params);
-
-std::string get_image_params(SDParams params, int seed);
diff --git a/examples/CMakeLists.txt b/examples/CMakeLists.txt
index 658e3a0..81053f9 100644
--- a/examples/CMakeLists.txt
+++ b/examples/CMakeLists.txt
@@ -1,4 +1,3 @@
include_directories(${CMAKE_CURRENT_SOURCE_DIR})
-add_subdirectory(cli)
-add_subdirectory(convert)
\ No newline at end of file
+add_subdirectory(cli)
\ No newline at end of file
diff --git a/examples/cli/CMakeLists.txt b/examples/cli/CMakeLists.txt
index 06ea964..4861bd3 100644
--- a/examples/cli/CMakeLists.txt
+++ b/examples/cli/CMakeLists.txt
@@ -2,5 +2,5 @@ set(TARGET sd)
add_executable(${TARGET} main.cpp)
install(TARGETS ${TARGET} RUNTIME)
-target_link_libraries(${TARGET} PRIVATE stable-diffusion common ${CMAKE_THREAD_LIBS_INIT})
+target_link_libraries(${TARGET} PRIVATE stable-diffusion ${CMAKE_THREAD_LIBS_INIT})
target_compile_features(${TARGET} PUBLIC cxx_std_11)
\ No newline at end of file
diff --git a/examples/cli/main.cpp b/examples/cli/main.cpp
index d5e69a6..8d5196a 100644
--- a/examples/cli/main.cpp
+++ b/examples/cli/main.cpp
@@ -1,8 +1,9 @@
#include
#include
#include
-#include "common.h"
+#include "ggml/ggml.h"
#include "stable-diffusion.h"
+#include "util.h"
#define STB_IMAGE_IMPLEMENTATION
#include "stb_image.h"
@@ -11,6 +12,405 @@
#define STB_IMAGE_WRITE_STATIC
#include "stb_image_write.h"
+#include
+#include
+#include
+#include
+
+const char* rng_type_to_str[] = {
+ "std_default",
+ "cuda",
+};
+
+// Names of the sampler method, same order as enum sample_method in stable-diffusion.h
+const char* sample_method_str[] = {
+ "euler_a",
+ "euler",
+ "heun",
+ "dpm2",
+ "dpm++2s_a",
+ "dpm++2m",
+ "dpm++2mv2",
+ "lcm",
+};
+
+// Names of the sigma schedule overrides, same order as sample_schedule in stable-diffusion.h
+const char* schedule_str[] = {
+ "default",
+ "discrete",
+ "karras",
+};
+
+const char* modes_str[] = {
+ "txt2img",
+ "img2img",
+};
+
+enum SDMode {
+ TXT2IMG,
+ IMG2IMG,
+ MODE_COUNT
+};
+
+struct SDParams {
+ int n_threads = -1;
+ SDMode mode = TXT2IMG;
+
+ std::string model_path;
+ std::string vae_path;
+    ggml_type wtype = GGML_TYPE_COUNT;  // GGML_TYPE_COUNT means "not specified": keep the type stored in the weight file
+ std::string lora_model_dir;
+ std::string output_path = "output.png";
+ std::string input_path;
+
+ std::string prompt;
+ std::string negative_prompt;
+ float cfg_scale = 7.0f;
+ int width = 512;
+ int height = 512;
+ int batch_count = 1;
+
+ SampleMethod sample_method = EULER_A;
+ Schedule schedule = DEFAULT;
+ int sample_steps = 20;
+ float strength = 0.75f;
+ RNGType rng_type = CUDA_RNG;
+ int64_t seed = 42;
+ bool verbose = false;
+};
+
+void print_params(SDParams params) {
+ printf("Option: \n");
+ printf(" n_threads: %d\n", params.n_threads);
+ printf(" mode: %s\n", modes_str[params.mode]);
+ printf(" model_path: %s\n", params.model_path.c_str());
+ printf(" wtype: %s\n", params.wtype < GGML_TYPE_COUNT ? ggml_type_name(params.wtype) : "unspecified");
+ printf(" vae_path: %s\n", params.vae_path.c_str());
+ printf(" output_path: %s\n", params.output_path.c_str());
+ printf(" init_img: %s\n", params.input_path.c_str());
+ printf(" prompt: %s\n", params.prompt.c_str());
+ printf(" negative_prompt: %s\n", params.negative_prompt.c_str());
+ printf(" cfg_scale: %.2f\n", params.cfg_scale);
+ printf(" width: %d\n", params.width);
+ printf(" height: %d\n", params.height);
+ printf(" sample_method: %s\n", sample_method_str[params.sample_method]);
+ printf(" schedule: %s\n", schedule_str[params.schedule]);
+ printf(" sample_steps: %d\n", params.sample_steps);
+ printf(" strength(img2img): %.2f\n", params.strength);
+ printf(" rng: %s\n", rng_type_to_str[params.rng_type]);
+ printf(" seed: %ld\n", params.seed);
+ printf(" batch_count: %d\n", params.batch_count);
+}
+
+void print_usage(int argc, const char* argv[]) {
+ printf("usage: %s [arguments]\n", argv[0]);
+ printf("\n");
+ printf("arguments:\n");
+ printf(" -h, --help show this help message and exit\n");
+ printf(" -M, --mode [txt2img or img2img] generation mode (default: txt2img)\n");
+ printf(" -t, --threads N number of threads to use during computation (default: -1).\n");
+ printf(" If threads <= 0, then threads will be set to the number of CPU physical cores\n");
+ printf(" -m, --model [MODEL] path to model\n");
+ printf(" --vae [VAE] path to vae\n");
+ printf(" --type [TYPE] weight type (f32, f16, q4_0, q4_1, q5_0, q5_1, q8_0)\n");
+ printf(" If not specified, the default is the type of the weight file.");
+ printf(" --lora-model-dir [DIR] lora model directory\n");
+ printf(" -i, --init-img [IMAGE] path to the input image, required by img2img\n");
+ printf(" -o, --output OUTPUT path to write result image to (default: ./output.png)\n");
+ printf(" -p, --prompt [PROMPT] the prompt to render\n");
+ printf(" -n, --negative-prompt PROMPT the negative prompt (default: \"\")\n");
+ printf(" --cfg-scale SCALE unconditional guidance scale: (default: 7.0)\n");
+ printf(" --strength STRENGTH strength for noising/unnoising (default: 0.75)\n");
+ printf(" 1.0 corresponds to full destruction of information in init image\n");
+ printf(" -H, --height H image height, in pixel space (default: 512)\n");
+ printf(" -W, --width W image width, in pixel space (default: 512)\n");
+ printf(" --sampling-method {euler, euler_a, heun, dpm2, dpm++2s_a, dpm++2m, dpm++2mv2, lcm}\n");
+ printf(" sampling method (default: \"euler_a\")\n");
+ printf(" --steps STEPS number of sample steps (default: 20)\n");
+ printf(" --rng {std_default, cuda} RNG (default: cuda)\n");
+ printf(" -s SEED, --seed SEED RNG seed (default: 42, use random seed for < 0)\n");
+ printf(" -b, --batch-count COUNT number of images to generate.\n");
+ printf(" --schedule {discrete, karras} Denoiser sigma schedule (default: discrete)\n");
+ printf(" -v, --verbose print extra info\n");
+}
+
+void parse_args(int argc, const char** argv, SDParams& params) {
+ bool invalid_arg = false;
+ std::string arg;
+ for (int i = 1; i < argc; i++) {
+ arg = argv[i];
+
+ if (arg == "-t" || arg == "--threads") {
+ if (++i >= argc) {
+ invalid_arg = true;
+ break;
+ }
+ params.n_threads = std::stoi(argv[i]);
+ } else if (arg == "-M" || arg == "--mode") {
+ if (++i >= argc) {
+ invalid_arg = true;
+ break;
+ }
+ const char* mode_selected = argv[i];
+ int mode_found = -1;
+ for (int d = 0; d < MODE_COUNT; d++) {
+ if (!strcmp(mode_selected, modes_str[d])) {
+ mode_found = d;
+ }
+ }
+ if (mode_found == -1) {
+ fprintf(stderr, "error: invalid mode %s, must be one of [txt2img, img2img]\n",
+ mode_selected);
+ exit(1);
+ }
+ params.mode = (SDMode)mode_found;
+ } else if (arg == "-m" || arg == "--model") {
+ if (++i >= argc) {
+ invalid_arg = true;
+ break;
+ }
+ params.model_path = argv[i];
+ } else if (arg == "--vae") {
+ if (++i >= argc) {
+ invalid_arg = true;
+ break;
+ }
+ params.vae_path = argv[i];
+ } else if (arg == "--type") {
+ if (++i >= argc) {
+ invalid_arg = true;
+ break;
+ }
+ std::string type = argv[i];
+ if (type == "f32") {
+ params.wtype = GGML_TYPE_F32;
+ } else if (type == "f16") {
+ params.wtype = GGML_TYPE_F16;
+ } else if (type == "q4_0") {
+ params.wtype = GGML_TYPE_Q4_0;
+ } else if (type == "q4_1") {
+ params.wtype = GGML_TYPE_Q4_1;
+ } else if (type == "q5_0") {
+ params.wtype = GGML_TYPE_Q5_0;
+ } else if (type == "q5_1") {
+ params.wtype = GGML_TYPE_Q5_1;
+ } else if (type == "q8_0") {
+ params.wtype = GGML_TYPE_Q8_0;
+ } else {
+ fprintf(stderr, "error: invalid weight format %s, must be one of [f32, f16, q4_0, q4_1, q5_0, q5_1, q8_0]\n",
+ type.c_str());
+ exit(1);
+ }
+ } else if (arg == "--lora-model-dir") {
+ if (++i >= argc) {
+ invalid_arg = true;
+ break;
+ }
+ params.lora_model_dir = argv[i];
+ } else if (arg == "-i" || arg == "--init-img") {
+ if (++i >= argc) {
+ invalid_arg = true;
+ break;
+ }
+ params.input_path = argv[i];
+ } else if (arg == "-o" || arg == "--output") {
+ if (++i >= argc) {
+ invalid_arg = true;
+ break;
+ }
+ params.output_path = argv[i];
+ } else if (arg == "-p" || arg == "--prompt") {
+ if (++i >= argc) {
+ invalid_arg = true;
+ break;
+ }
+ params.prompt = argv[i];
+ } else if (arg == "-n" || arg == "--negative-prompt") {
+ if (++i >= argc) {
+ invalid_arg = true;
+ break;
+ }
+ params.negative_prompt = argv[i];
+ } else if (arg == "--cfg-scale") {
+ if (++i >= argc) {
+ invalid_arg = true;
+ break;
+ }
+ params.cfg_scale = std::stof(argv[i]);
+ } else if (arg == "--strength") {
+ if (++i >= argc) {
+ invalid_arg = true;
+ break;
+ }
+ params.strength = std::stof(argv[i]);
+ } else if (arg == "-H" || arg == "--height") {
+ if (++i >= argc) {
+ invalid_arg = true;
+ break;
+ }
+ params.height = std::stoi(argv[i]);
+ } else if (arg == "-W" || arg == "--width") {
+ if (++i >= argc) {
+ invalid_arg = true;
+ break;
+ }
+ params.width = std::stoi(argv[i]);
+ } else if (arg == "--steps") {
+ if (++i >= argc) {
+ invalid_arg = true;
+ break;
+ }
+ params.sample_steps = std::stoi(argv[i]);
+ } else if (arg == "-b" || arg == "--batch-count") {
+ if (++i >= argc) {
+ invalid_arg = true;
+ break;
+ }
+ params.batch_count = std::stoi(argv[i]);
+ } else if (arg == "--rng") {
+ if (++i >= argc) {
+ invalid_arg = true;
+ break;
+ }
+ std::string rng_type_str = argv[i];
+ if (rng_type_str == "std_default") {
+ params.rng_type = STD_DEFAULT_RNG;
+ } else if (rng_type_str == "cuda") {
+ params.rng_type = CUDA_RNG;
+ } else {
+ invalid_arg = true;
+ break;
+ }
+ } else if (arg == "--schedule") {
+ if (++i >= argc) {
+ invalid_arg = true;
+ break;
+ }
+ const char* schedule_selected = argv[i];
+ int schedule_found = -1;
+ for (int d = 0; d < N_SCHEDULES; d++) {
+ if (!strcmp(schedule_selected, schedule_str[d])) {
+ schedule_found = d;
+ }
+ }
+ if (schedule_found == -1) {
+ invalid_arg = true;
+ break;
+ }
+ params.schedule = (Schedule)schedule_found;
+ } else if (arg == "-s" || arg == "--seed") {
+ if (++i >= argc) {
+ invalid_arg = true;
+ break;
+ }
+ params.seed = std::stoll(argv[i]);
+ } else if (arg == "--sampling-method") {
+ if (++i >= argc) {
+ invalid_arg = true;
+ break;
+ }
+ const char* sample_method_selected = argv[i];
+ int sample_method_found = -1;
+ for (int m = 0; m < N_SAMPLE_METHODS; m++) {
+ if (!strcmp(sample_method_selected, sample_method_str[m])) {
+ sample_method_found = m;
+ }
+ }
+ if (sample_method_found == -1) {
+ invalid_arg = true;
+ break;
+ }
+ params.sample_method = (SampleMethod)sample_method_found;
+ } else if (arg == "-h" || arg == "--help") {
+ print_usage(argc, argv);
+ exit(0);
+ } else if (arg == "-v" || arg == "--verbose") {
+ params.verbose = true;
+ } else {
+ fprintf(stderr, "error: unknown argument: %s\n", arg.c_str());
+ print_usage(argc, argv);
+ exit(1);
+ }
+ }
+ if (invalid_arg) {
+ fprintf(stderr, "error: invalid parameter for argument: %s\n", arg.c_str());
+ print_usage(argc, argv);
+ exit(1);
+ }
+ if (params.n_threads <= 0) {
+ params.n_threads = get_num_physical_cores();
+ }
+
+ if (params.prompt.length() == 0) {
+ fprintf(stderr, "error: the following arguments are required: prompt\n");
+ print_usage(argc, argv);
+ exit(1);
+ }
+
+ if (params.model_path.length() == 0) {
+ fprintf(stderr, "error: the following arguments are required: model_path\n");
+ print_usage(argc, argv);
+ exit(1);
+ }
+
+ if (params.mode == IMG2IMG && params.input_path.length() == 0) {
+ fprintf(stderr, "error: when using the img2img mode, the following arguments are required: init-img\n");
+ print_usage(argc, argv);
+ exit(1);
+ }
+
+ if (params.output_path.length() == 0) {
+ fprintf(stderr, "error: the following arguments are required: output_path\n");
+ print_usage(argc, argv);
+ exit(1);
+ }
+
+ if (params.width <= 0 || params.width % 64 != 0) {
+ fprintf(stderr, "error: the width must be a multiple of 64\n");
+ exit(1);
+ }
+
+ if (params.height <= 0 || params.height % 64 != 0) {
+ fprintf(stderr, "error: the height must be a multiple of 64\n");
+ exit(1);
+ }
+
+ if (params.sample_steps <= 0) {
+ fprintf(stderr, "error: the sample_steps must be greater than 0\n");
+ exit(1);
+ }
+
+ if (params.strength < 0.f || params.strength > 1.f) {
+ fprintf(stderr, "error: can only work with strength in [0.0, 1.0]\n");
+ exit(1);
+ }
+
+ if (params.seed < 0) {
+ srand((int)time(NULL));
+ params.seed = rand();
+ }
+}
+
+std::string get_image_params(SDParams params, int64_t seed) {
+ std::string parameter_string = params.prompt + "\n";
+ if (params.negative_prompt.size() != 0) {
+ parameter_string += "Negative prompt: " + params.negative_prompt + "\n";
+ }
+ parameter_string += "Steps: " + std::to_string(params.sample_steps) + ", ";
+ parameter_string += "CFG scale: " + std::to_string(params.cfg_scale) + ", ";
+ parameter_string += "Seed: " + std::to_string(seed) + ", ";
+ parameter_string += "Size: " + std::to_string(params.width) + "x" + std::to_string(params.height) + ", ";
+ parameter_string += "Model: " + basename(params.model_path) + ", ";
+ parameter_string += "RNG: " + std::string(rng_type_to_str[params.rng_type]) + ", ";
+ parameter_string += "Sampler: " + std::string(sample_method_str[params.sample_method]);
+ if (params.schedule == KARRAS) {
+ parameter_string += " karras";
+ }
+ parameter_string += ", ";
+ parameter_string += "Version: stable-diffusion.cpp";
+ return parameter_string;
+}
+
int main(int argc, const char* argv[]) {
SDParams params;
parse_args(argc, argv, params);
@@ -50,7 +450,7 @@ int main(int argc, const char* argv[]) {
}
StableDiffusion sd(params.n_threads, vae_decode_only, true, params.lora_model_dir, params.rng_type);
- if (!sd.load_from_file(params.model_path, params.schedule)) {
+ if (!sd.load_from_file(params.model_path, params.vae_path, params.wtype, params.schedule)) {
return 1;
}
@@ -79,7 +479,7 @@ int main(int argc, const char* argv[]) {
}
if (results.size() == 0 || results.size() != params.batch_count) {
- fprintf(stderr, "generate failed\n");
+ LOG_ERROR("generate failed");
return 1;
}
@@ -88,7 +488,7 @@ int main(int argc, const char* argv[]) {
for (int i = 0; i < params.batch_count; i++) {
std::string final_image_path = i > 0 ? dummy_name + "_" + std::to_string(i + 1) + ".png" : dummy_name + ".png";
stbi_write_png(final_image_path.c_str(), params.width, params.height, 3, results[i], 0, get_image_params(params, params.seed + i).c_str());
- printf("save result image to '%s'\n", final_image_path.c_str());
+ LOG_INFO("save result image to '%s'", final_image_path.c_str());
}
return 0;
diff --git a/examples/convert/CMakeLists.txt b/examples/convert/CMakeLists.txt
deleted file mode 100644
index e71d107..0000000
--- a/examples/convert/CMakeLists.txt
+++ /dev/null
@@ -1,5 +0,0 @@
-set(TARGET convert)
-
-add_executable(${TARGET} convert.cpp vocab.hpp)
-target_link_libraries(${TARGET} PRIVATE ggml zip ${CMAKE_THREAD_LIBS_INIT})
-target_compile_features(${TARGET} PUBLIC cxx_std_11)
diff --git a/examples/convert/README.md b/examples/convert/README.md
deleted file mode 100644
index 4645916..0000000
--- a/examples/convert/README.md
+++ /dev/null
@@ -1,16 +0,0 @@
-# Model Convert
-
-## Usage
-```
-usage: convert.exe [MODEL_PATH] --type [OUT_TYPE] [arguments]
-Model supported for conversion: .safetensors models or .ckpt checkpoints models
-
-arguments:
- -h, --help show this help message and exit
- -o, --out [FILENAME] path or name to converted model
- --vocab [FILENAME] path to custom vocab.json (usually unnecessary)
- -v, --verbose print processing info - dev info
- -l, --lora force read the model as a LoRA
- --vae [FILENAME] merge a custom VAE
- -t, --type [OUT_TYPE] output format (f32, f16, q4_0, q4_1, q5_0, q5_1, q8_0)
-```
diff --git a/examples/convert/convert.cpp b/examples/convert/convert.cpp
deleted file mode 100644
index c0271c8..0000000
--- a/examples/convert/convert.cpp
+++ /dev/null
@@ -1,1565 +0,0 @@
-#include "ggml/ggml.h"
-
-// third-party libraries
-#include "json.hpp"
-#include "zip.h"
-
-#include
-#include
-#include
-#include
-#include
-#include
-
-/*
- References:
-
- Pickle Format: https://github.com/python/cpython/blob/main/Lib/pickle.py
- Safetensors: https://huggingface.co/docs/safetensors/index
- bfloat16 conversion: https://en.wikipedia.org/wiki/Bfloat16_floating-point_format
- Vocab source: https://huggingface.co/openai/clip-vit-large-patch14/blob/main/vocab.json
- diffusers to original conversion: https://github.com/comfyanonymous/ComfyUI/blob/master/comfy/diffusers_convert.py
-
-*/
-
-std::string format(const char* fmt, ...) {
- char result[100];
- va_list args;
- va_start(args, fmt);
- vsnprintf(result, 100, fmt, args);
- va_end(args);
- return std::string(result);
-}
-
-float bfloat16_to_fp32(uint16_t bfloat16) {
-    uint32_t val_bits = (static_cast<uint32_t>(bfloat16) << 16);
-    return *reinterpret_cast<float*>(&val_bits);
-}
-
-#include "vocab.hpp"
-
-using json = nlohmann::json;
-
-#define MAX_STRING_BUFFER 95
-#define UNUSED_MODEL_TENSORS 20
-#define TIMESTEPS 1000
-
-const char* unused_tensors[UNUSED_MODEL_TENSORS] = {
- "betas",
- "alphas_cumprod_prev",
- "sqrt_alphas_cumprod",
- "sqrt_one_minus_alphas_cumprod",
- "log_one_minus_alphas_cumprod",
- "sqrt_recip_alphas_cumprod",
- "sqrt_recipm1_alphas_cumprod",
- "posterior_variance",
- "posterior_log_variance_clipped",
- "posterior_mean_coef1",
- "posterior_mean_coef2",
- "cond_stage_model.transformer.text_model.embeddings.position_ids",
- "cond_stage_model.model.logit_scale",
- "cond_stage_model.model.text_projection",
- "model.diffusion_model.time_embedding.cond_proj.weight",
- "model_ema.decay",
- "model_ema.num_updates",
- "model_ema.diffusion_model",
- "control_model",
- "embedding_manager"};
-
-std::string kqv_self[6] = {
- "self_attn.q_proj.weight",
- "self_attn.k_proj.weight",
- "self_attn.v_proj.weight",
-
- "self_attn.q_proj.bias",
- "self_attn.k_proj.bias",
- "self_attn.v_proj.bias"};
-
-#ifdef _WIN32 // code for windows
-#include
-
-bool file_exists(const std::string& filename) {
- DWORD attributes = GetFileAttributesA(filename.c_str());
- return (attributes != INVALID_FILE_ATTRIBUTES && !(attributes & FILE_ATTRIBUTE_DIRECTORY));
-}
-
-bool is_directory(const std::string& path) {
- DWORD attributes = GetFileAttributesA(path.c_str());
- return (attributes != INVALID_FILE_ATTRIBUTES && (attributes & FILE_ATTRIBUTE_DIRECTORY));
-}
-
-#else // code for linux
-#include
-#include
-
-bool file_exists(const std::string& filename) {
- struct stat buffer;
- return (stat(filename.c_str(), &buffer) == 0 && S_ISREG(buffer.st_mode));
-}
-
-bool is_directory(const std::string& path) {
- struct stat buffer;
- return (stat(path.c_str(), &buffer) == 0 && S_ISDIR(buffer.st_mode));
-}
-
-#endif
-
-enum SDVersion {
- VERSION_1_x,
- VERSION_2_x,
- VERSION_XL
-};
-
-enum ReadPhase {
- READ_NAME,
- READ_DATA,
- CHECK_SIZE,
- READ_DIMENS
-};
-
-enum SDLoraType {
- LORA_NONE,
- LORA_REGULAR,
- LORA_DIFFUSERS,
- LORA_TRANSFORMERS
-};
-
-enum DataPointerType {
- CHECKPOINT,
- SAFETENSOR
-};
-
-enum TensorTarget {
- NONE,
- CLIP,
- UNET,
- VAE,
-};
-
-struct ConvertParams {
- ggml_type out_type = GGML_TYPE_F32;
- SDVersion version = VERSION_1_x;
- std::string model_name = "";
- std::string model_path = "";
- std::string custom_vae_path = "";
-
- std::string output_path = "";
- std::string vocab_path = "";
-
- // file pointers
- std::vector pkl_fp;
- std::vector sf_fp;
-
- bool from_folder = false;
- bool merge_custom_vae = false;
- bool verbose = false;
- bool generate_alphas_cumprod = false;
-
- // LoRA
- bool lora = false;
- std::map lora_alphas;
- std::set alpha_keys;
- std::vector alpha_values;
- SDLoraType lora_type = LORA_NONE;
-
- // VAE
- bool vae = false;
-};
-
-struct Tensor {
- std::string name;
- size_t data_offset = 0;
- ggml_type dtype = GGML_TYPE_F32;
- size_t data_size = 0;
- int32_t shape[4] = {1, 1, 1, 1};
- int32_t n_dims = 0;
- ReadPhase t_phase = READ_NAME;
- int32_t num_elements = 0;
- bool is_view = false;
- void* data = NULL;
- int32_t ptr_idx = -1;
- DataPointerType ptr_type = CHECKPOINT;
- TensorTarget target = NONE;
-
- Tensor() {}
-
- Tensor(std::string name, ggml_type type, size_t data_size, const int32_t* ne, int n_dims, int32_t num_elements, bool is_view)
- : name(name), dtype(type), data_size(data_size), n_dims(n_dims), num_elements(num_elements), is_view(is_view) {
- for (int i = 0; i < n_dims; i++) {
- shape[i] = ne[i];
- }
- }
-
- bool detect_target(ConvertParams params) {
- if (target != NONE) {
- return false;
- }
- if (name.find("first_stage_model.") == 0 || params.vae) {
- target = VAE;
- } else if (name.find("model.diffusion_model.") == 0 ||
- params.lora && name.find(".unet.") != std::string::npos) {
- target = UNET;
- } else if (name.find("cond_stage_model.") == 0 ||
- name.find("conditioner.") == 0 ||
- params.lora && name.find("text.model.") != std::string::npos) {
- target = CLIP;
- }
- return true;
- }
-
- void dump() {
- printf("Tensor: %30s | n_dim: %i | [%i, %i, %i, %i] | %s \n", name.c_str(), n_dims, shape[0], shape[1], shape[2], shape[3], ggml_type_name(dtype));
- }
-
- int64_t* inverse_shape() {
- int64_t* v = new int64_t[4];
- for (int i = 0; i < 4; i++) {
- v[i] = (i < n_dims) ? shape[n_dims - 1 - i] : 1;
- }
- return v;
- }
-};
-
-typedef std::unordered_map<std::string, Tensor> TensorMap;
-
-/*
-
- UTILS FUNTIONS
-
-*/
-
-void sd_fread(void* ptr, size_t size, size_t count, FILE* stream) {
- size_t ret = std::fread(ptr, size, count, stream);
- if (ret != count) {
- printf("Error: read from file failed");
- exit(1);
- }
-}
-
-int64_t read_long(uint8_t* buffer) {
- // little endian
- int64_t value = 0;
-    value |= static_cast<int64_t>(buffer[7]) << 56;
-    value |= static_cast<int64_t>(buffer[6]) << 48;
-    value |= static_cast<int64_t>(buffer[5]) << 40;
-    value |= static_cast<int64_t>(buffer[4]) << 32;
-    value |= static_cast<int64_t>(buffer[3]) << 24;
-    value |= static_cast<int64_t>(buffer[2]) << 16;
-    value |= static_cast<int64_t>(buffer[1]) << 8;
-    value |= static_cast<int64_t>(buffer[0]);
- return value;
-}
-
-int32_t read_int(uint8_t* buffer) {
- // little endian
- int value = 0;
- value |= buffer[3] << 24;
- value |= buffer[2] << 16;
- value |= buffer[1] << 8;
- value |= buffer[0];
- return value;
-}
-
-uint16_t read_short(uint8_t* buffer) {
- // little endian
- uint16_t value = 0;
- value |= buffer[1] << 8;
- value |= buffer[0];
- return value;
-}
-
-int8_t find_char(uint8_t* buffer, char c) {
- for (int8_t len = 0; len < MAX_STRING_BUFFER; len++) {
- if (buffer[len] == c) {
- return len;
- }
- }
- return -1;
-}
-
-// ported from https://github.com/openai/CLIP/blob/main/clip/simple_tokenizer.py#L16
-std::map unicode_to_byte() {
- std::map byte_to_unicode;
-
- // List of utf-8 byte ranges
- for (int b = static_cast('!'); b <= static_cast('~'); ++b) {
- byte_to_unicode[b] = static_cast(b);
- }
-
- for (int b = 49825; b <= 49836; ++b) {
- byte_to_unicode[b] = static_cast(b);
- }
-
- for (int b = 49838; b <= 50111; ++b) {
- byte_to_unicode[b] = static_cast(b);
- }
- // printf("%d %d %d %d\n", static_cast('¡'), static_cast('¬'), static_cast('®'), static_cast('ÿ'));
- // exit(1);
-
- int n = 0;
- for (int b = 0; b < 256; ++b) {
- if (byte_to_unicode.find(b) == byte_to_unicode.end()) {
- byte_to_unicode[b] = static_cast(256 + n);
- n++;
- }
- }
-
- // byte_encoder = bytes_to_unicode()
- // byte_decoder = {v: k for k, v in byte_encoder.items()}
- std::map byte_decoder;
-
- for (const auto& entry : byte_to_unicode) {
- byte_decoder[entry.second] = entry.first;
- }
-
- byte_to_unicode.clear();
-
- return byte_decoder;
-}
-
-bool is_unused_tensor(std::string name) {
- for (int i = 0; i < UNUSED_MODEL_TENSORS; i++) {
- if (name.find(unused_tensors[i]) == 0) {
- return true;
- }
- }
- return false;
-}
-
-float* calculate_alpha_cumprod(float linear_start = 0.00085f, float linear_end = 0.0120, int timesteps = TIMESTEPS) {
- float* ac = (float*)malloc(timesteps * 4);
- float ls_sqrt = sqrtf(linear_start);
- float le_sqrt = sqrtf(linear_end);
- float amount = le_sqrt - ls_sqrt;
- float product = 1.0f;
- for (int i = 0; i < timesteps; i++) {
- float beta = ls_sqrt + amount * ((float)i / (timesteps - 1));
- product *= 1.0f - powf(beta, 2.0f);
- ac[i] = product;
- }
- return ac;
-}
-
-/*
-
- READ PYTORCH CHECKPOINT MODEL
-
-*/
-
-static ggml_type global_type = GGML_TYPE_F32; // all tensors data type
-static bool read_global_type = false;
-
-void exist_in_zip(struct zip_t* zip, const char* f_test, Tensor& tensor) {
- size_t i, n = zip_entries_total(zip);
- for (i = 0; i < n; ++i) {
- zip_entry_openbyindex(zip, i);
- {
- const char* name = zip_entry_name(zip);
- if (strcmp(name, f_test) == 0) {
- tensor.data_offset = i;
- tensor.data_size = zip_entry_size(zip);
- zip_entry_close(zip);
- return;
- }
- }
- zip_entry_close(zip);
- }
-}
-
-bool set_pkl_tensor_props(uint32_t value, struct Tensor& tensor) {
- if (tensor.t_phase == CHECK_SIZE) {
- if (tensor.data_size == value * ggml_type_size(tensor.dtype)) {
- tensor.num_elements = value;
- tensor.t_phase = READ_DIMENS;
- return true;
- } else {
- tensor.t_phase = READ_NAME;
- }
- } else if (tensor.t_phase == READ_DIMENS) {
- if (tensor.n_dims + 1 > 4) { // too many dimens
- tensor.t_phase = READ_NAME;
- tensor.n_dims = 0;
- }
- if (tensor.num_elements % value == 0) {
- tensor.shape[tensor.n_dims] = value;
- tensor.n_dims++;
- }
- }
- return false;
-}
-
-void read_pkl_data_type(char* _name, struct Tensor& tensor) {
- if (!strcmp(_name, "FloatStorage")) {
- if (read_global_type) {
- global_type = GGML_TYPE_F32;
- read_global_type = false;
- }
- tensor.dtype = GGML_TYPE_F32;
- } else if (!strcmp(_name, "HalfStorage")) {
- if (read_global_type) {
- global_type = GGML_TYPE_F16;
- read_global_type = false;
- }
- tensor.dtype = GGML_TYPE_F16;
- }
-}
-
-void read_pkl_string(char* text_str, struct zip_t* zip, std::string dir, struct Tensor& tensor) {
- if (!strcmp(text_str, "storage")) {
- read_global_type = true;
- } else if (strcmp(text_str, "state_dict")) { // no state_dict
- if (tensor.t_phase == READ_DATA) {
- std::string zip_entry_name = dir + "data/" + std::string(text_str);
- exist_in_zip(zip, zip_entry_name.c_str(), tensor);
- tensor.t_phase = tensor.data_size > 0 ? CHECK_SIZE : READ_NAME;
- }
- if (!read_global_type && tensor.t_phase == READ_NAME) {
- tensor.name = text_str;
- tensor.t_phase = READ_DATA;
- tensor.dtype = global_type;
- }
- }
-}
-
-// $ python -m pickletools sd-v1-4/archive/data.pkl | head -n 100
-// 0: \x80 PROTO 2
-// 2: } EMPTY_DICT
-// 3: q BINPUT 0
-// 5: ( MARK
-// 6: X BINUNICODE 'epoch'
-// 16: q BINPUT 1
-// 18: K BININT1 6
-// 20: X BINUNICODE 'global_step'
-// 36: q BINPUT 2
-// 38: J BININT 470000
-// 43: X BINUNICODE 'pytorch-lightning_version'
-// 73: q BINPUT 3
-// 75: X BINUNICODE '1.4.2'
-// 85: q BINPUT 4
-// 87: X BINUNICODE 'state_dict'
-// 102: q BINPUT 5
-// 104: } EMPTY_DICT
-// 105: q BINPUT 6
-// 107: ( MARK
-// 108: X BINUNICODE 'betas'
-// 118: q BINPUT 7
-// 120: c GLOBAL 'torch._utils _rebuild_tensor_v2'
-// 153: q BINPUT 8
-// 155: ( MARK
-// 156: ( MARK
-// 157: X BINUNICODE 'storage'
-// 169: q BINPUT 9
-// 171: c GLOBAL 'torch FloatStorage'
-// 191: q BINPUT 10
-// 193: X BINUNICODE '0'
-// 199: q BINPUT 11
-// 201: X BINUNICODE 'cpu'
-// 209: q BINPUT 12
-// 211: M BININT2 1000
-// 214: t TUPLE (MARK at 156)
-// 215: q BINPUT 13
-// 217: Q BINPERSID
-// 218: K BININT1 0
-// 220: M BININT2 1000
-// ...............................
-// 3201: q BINPUT 250
-// 3203: R REDUCE
-// 3204: q BINPUT 251
-// 3206: X BINUNICODE 'model.diffusion_model.input_blocks.1.1.proj_in.weight'
-// 3264: q BINPUT 252
-// 3266: h BINGET 8
-// 3268: ( MARK
-// 3269: ( MARK
-// 3270: h BINGET 9
-// 3272: h BINGET 10
-// 3274: X BINUNICODE '30'
-// 3281: q BINPUT 253
-// 3283: h BINGET 12
-// 3285: J BININT 102400
-// 3290: t TUPLE (MARK at 3269)
-// 3291: q BINPUT 254
-// 3293: Q BINPERSID
-// 3294: K BININT1 0
-// 3296: ( MARK
-// 3297: M BININT2 320
-// 3300: M BININT2 320
-// 3303: K BININT1 1
-// 3305: K BININT1 1
-// 3307: t TUPLE (MARK at 3296)
-// 3308: q BINPUT 255
-// 3310: ( MARK
-// 3311: M BININT2 320
-// 3314: K BININT1 1
-// 3316: K BININT1 1
-// 3318: K BININT1 1
-// 3320: t TUPLE (MARK at 3310)
-// 3321: r LONG_BINPUT 256
-// 3326: \x89 NEWFALSE
-// 3327: h BINGET 16
-// 3329: ) EMPTY_TUPLE
-// 3330: R REDUCE
-// 3331: r LONG_BINPUT 257
-// 3336: t TUPLE (MARK at 3268)
-// 3337: r LONG_BINPUT 258
-// 3342: R REDUCE
-// 3343: r LONG_BINPUT 259
-// 3348: X BINUNICODE 'model.diffusion_model.input_blocks.1.1.proj_in.bias'
-// 3404: r LONG_BINPUT 260
-// 3409: h BINGET 8
-// 3411: ( MARK
-// 3412: ( MARK
-// 3413: h BINGET 9
-// 3415: h BINGET 10
-// 3417: X BINUNICODE '31'
-
-void read_pkl_props(uint8_t* buffer,
- zip_t* zip,
- std::string dir,
- TensorMap& tensors,
- ConvertParams& params,
- bool root_model,
- TensorTarget target = NONE) {
- if (buffer[0] == 0x80) { // proto
- if (buffer[1] != 2) {
- printf("Unsupported protocol\n");
- return;
- }
- buffer += 2; // 0x80 and version
- char string_buffer[MAX_STRING_BUFFER];
- bool finish = false;
- Tensor tensor;
- // read pickle binary file
- while (!finish) {
- uint8_t opcode = *buffer;
- buffer++;
- // https://github.com/python/cpython/blob/3.7/Lib/pickletools.py#L1048
- // https://github.com/python/cpython/blob/main/Lib/pickle.py#L105
- switch (opcode) {
- case '}': // EMPTY_DICT = b'}' # push empty dict
- break;
- case ']': // EMPTY_LIST = b']' # push empty list
- break;
- // skip unused sections
- case 'h': // BINGET = b'h' # " " " " " " ; " " 1-byte arg
- case 'q': // BINPUT = b'q' # " " " " " ; " " 1-byte arg
- case 'Q': // BINPERSID = b'Q' # " " " ; " " " " stack
- buffer++;
- break;
- case 'r': // LONG_BINPUT = b'r' # " " " " " ; " " 4-byte arg
- buffer += 4;
- break;
- case 0x95: // FRAME = b'\x95' # indicate the beginning of a new frame
- buffer += 8;
- break;
- case 0x94: // MEMOIZE = b'\x94' # store top of the stack in memo
- break;
- case '(': // MARK = b'(' # push special markobject on stack
- break;
- case 'K': // BININT1 = b'K' # push 1-byte unsigned int
- {
- uint8_t value = *buffer;
- if (set_pkl_tensor_props(value, tensor)) {
- buffer++;
- }
- buffer++;
- } break;
- case 'M': // BININT2 = b'M' # push 2-byte unsigned int
- {
- uint16_t value = read_short(buffer);
- if (set_pkl_tensor_props(value, tensor)) {
- buffer++;
- }
- buffer += 2;
- } break;
- case 'J': // BININT = b'J' # push four-byte signed int
- {
- const int32_t value = read_int(buffer);
- if (set_pkl_tensor_props(value, tensor)) {
- buffer++; // skip tuple after read num_elements
- }
- buffer += 4;
- } break;
- case 'X': // BINUNICODE = b'X' # " " " ; counted UTF-8 string argument
- {
- const int32_t len = read_int(buffer);
- buffer += 4;
- memset(string_buffer, 0, MAX_STRING_BUFFER);
- if (len > MAX_STRING_BUFFER) {
- printf("Tensor name very large\n");
- }
- memcpy(string_buffer, buffer, len < MAX_STRING_BUFFER ? len : (MAX_STRING_BUFFER - 1));
- buffer += len;
- read_pkl_string(string_buffer, zip, dir, tensor);
- if (params.verbose) {
- printf("pickle str: %s\n", string_buffer);
- }
- } break;
- case 0x8C: // SHORT_BINUNICODE = b'\x8c' # push short string; UTF-8 length < 256 bytes
- {
- const int8_t len = *buffer;
- buffer++;
- memset(string_buffer, 0, MAX_STRING_BUFFER);
- memcpy(string_buffer, buffer, len);
- buffer += len;
- // printf("String: '%s'\n", string_buffer);
- } break;
- case 'c': // GLOBAL = b'c' # push self.find_class(modname, name); 2 string args
- {
- int8_t len = find_char(buffer, '\n');
- buffer += len + 1;
- len = find_char(buffer, '\n');
- memset(string_buffer, 0, MAX_STRING_BUFFER);
- memcpy(string_buffer, buffer, len);
- buffer += len + 1;
- read_pkl_data_type(string_buffer, tensor);
- // printf("Global: %s\n", string_buffer);
- } break;
- case 0x86: // TUPLE2 = b'\x86' # build 2-tuple from two topmost stack items
- case 0x85: // TUPLE1 = b'\x85' # build 1-tuple from stack top
- case 't': // TUPLE = b't' # build tuple from topmost stack items
- if (tensor.t_phase == READ_DIMENS) {
- if (!is_unused_tensor(tensor.name)) { // ignore unused tensors
- tensor.ptr_idx = (int32_t)params.pkl_fp.size();
- if (target != NONE) {
- tensor.target = target;
- } else if (params.merge_custom_vae) {
- if (root_model) {
- tensor.detect_target(params);
- if (tensor.target == VAE) {
- tensor = Tensor();
- continue; // ignore original vae tensors
- }
- } else {
- tensor.target = VAE;
- tensor.detect_target(params);
- }
- }
- tensors[tensor.name] = tensor;
- }
- // reset
- tensor = Tensor();
- }
- break;
- case '.': // STOP = b'.' # every pickle ends with STOP
- finish = true;
- break;
- default:
- break;
- }
- }
- }
-}
-
-void read_vocab_json(std::map<int, std::string>& vocab_map, ConvertParams params) {
- char* vocab_buffer = NULL;
- if (!params.vocab_path.empty()) {
- FILE* fv = std::fopen(params.vocab_path.c_str(), "r");
- if (fv == NULL) {
- printf("Error: failed to open vocab file '%s'\n", params.vocab_path.c_str());
- exit(0);
- }
- fseek(fv, 0, SEEK_END);
- size_t file_size = ftell(fv);
- // return to begin
- fseek(fv, 0, SEEK_SET);
- vocab_buffer = (char*)malloc(file_size);
- sd_fread(vocab_buffer, 1, file_size, fv);
- fclose(fv);
- } else {
- // read embedded vocab
- printf("using embedded vocab\n");
-        vocab_buffer = reinterpret_cast<char*>(vocab_json);
- }
- json vocab = json::parse(vocab_buffer);
- std::map decoder = unicode_to_byte();
- for (auto& it : vocab.items()) {
- std::string token_str = it.key();
- std::string result = "";
- int id = it.value();
- for (char c : token_str) {
- result += decoder[c];
- }
- vocab_map[id] = result;
- }
-}
-
-/*
-
- PREPROCESS TENSORS
-
-*/
-
-std::string replace_name_by_map(const std::string full_name, std::unordered_map<std::string, std::string> ft_map) {
- std::string result = full_name;
- for (auto it : ft_map) {
- size_t pos = result.find(it.first);
- if (pos != std::string::npos) {
- result = result.replace(pos, it.first.size(), it.second);
- }
- }
- return result;
-}
-
-// hugging face pipeline to legacy stable diffusion
-std::unordered_map<std::string, std::string> unet_convert_map;
-std::unordered_map<std::string, std::string> unet_convert_map_resnet;
-std::unordered_map<std::string, std::string> unet_convert_map_layers;
-std::unordered_map<std::string, std::string> vae_convert_map;
-std::unordered_map<std::string, std::string> clip_convert_map;
-std::unordered_map<std::string, std::string> lora_fix_map;
-
-std::string convert_unet_to_original(std::string name, ConvertParams params) {
- bool resnet_tensor = name.find("resnets") != std::string::npos;
- const char* separator = params.lora ? "." : "_";
- if (unet_convert_map.empty()) {
- unet_convert_map[format("time%sembedding.linear%s1.weight", separator, separator)] = "time_embed.0.weight";
- unet_convert_map[format("time%sembedding.linear%s1.bias", separator, separator)] = "time_embed.0.bias";
- unet_convert_map[format("time%sembedding.linear%s2.weight", separator, separator)] = "time_embed.2.weight";
- unet_convert_map[format("time%sembedding.linear%s2.bias", separator, separator)] = "time_embed.2.bias";
- unet_convert_map[format("conv%sin.weight", separator)] = "input_blocks.0.0.weight";
- unet_convert_map[format("conv%sin.bias", separator)] = "input_blocks.0.0.bias";
- unet_convert_map[format("conv%snorm%sout.weight", separator, separator)] = "out.0.weight";
- unet_convert_map[format("conv%snorm%sout.bias", separator, separator)] = "out.0.bias";
- unet_convert_map[format("conv%sout.weight", separator)] = "out.2.weight";
- unet_convert_map[format("conv%sout.bias", separator)] = "out.2.bias";
- }
-
- // resnet
- if (unet_convert_map_resnet.empty() && resnet_tensor) {
- unet_convert_map_resnet["norm1"] = "in_layers.0";
- unet_convert_map_resnet["conv1"] = "in_layers.2";
- unet_convert_map_resnet["norm2"] = "out_layers.0";
- unet_convert_map_resnet["conv2"] = "out_layers.3";
- unet_convert_map_resnet[format("time%semb%sproj", separator, separator)] = "emb_layers.1";
- unet_convert_map_resnet[format("conv%sshortcut", separator)] = "skip_connection";
- }
-
- if (unet_convert_map_layers.empty()) {
- for (int i = 0; i < 4; i++) {
- for (int j = 0; j < 2; j++) {
- unet_convert_map_layers[format("down%sblocks.%i.resnets.%i.", separator, i, j)] = format("input_blocks.%i.0.", 3 * i + j + 1);
- if (i < 3) {
- unet_convert_map_layers[format("down%sblocks.%i.attentions.%i.", separator, i, j)] = format("input_blocks.%i.1.", 3 * i + j + 1);
- }
- }
- for (int j = 0; j < 3; j++) {
- unet_convert_map_layers[format("up%sblocks.%i.resnets.%i.", separator, i, j)] = format("output_blocks.%i.0.", 3 * i + j);
- if (i > 0) {
- unet_convert_map_layers[format("up%sblocks.%i.attentions.%i.", separator, i, j)] = format("output_blocks.%i.1.", 3 * i + j);
- }
- }
- if (i < 3) {
- unet_convert_map_layers[format("down%sblocks.%i.downsamplers.0.conv.", separator, i)] = format("input_blocks.%i.0.op.", 3 * (i + 1));
- unet_convert_map_layers[format("up%sblocks.%i.upsamplers.0.", separator, i)] = format("output_blocks.%i.%i.", 3 * i + 2, i == 0 ? 1 : 2);
- }
- }
- unet_convert_map_layers[format("mid%sblock.attentions.0.", separator)] = "middle_block.1.";
- for (int j = 0; j < 2; j++) {
- unet_convert_map_layers[format("mid%sblock.resnets.%i.", separator, j)] = format("middle_block.%i.", 2 * j);
- }
- }
- if (params.lora) {
- unet_convert_map[".unet."] = ".model.diffusion_model.";
- }
-
- std::string result = replace_name_by_map(name, unet_convert_map);
- result = replace_name_by_map(result, unet_convert_map_layers);
- if (resnet_tensor) {
- result = replace_name_by_map(result, unet_convert_map_resnet);
- }
- return result;
-}
-
-std::string convert_vae_to_original(std::string name, ConvertParams params) {
-    std::unordered_map<std::string, std::string> vae_map;
- bool hf_attention = name.find("attentions") != std::string::npos;
- if (vae_convert_map.empty()) {
- vae_convert_map["conv_shortcut"] = "nin_shortcut";
- vae_convert_map["conv_norm_out"] = "norm_out";
- vae_convert_map["mid_block.attentions.0."] = "mid.attn_1.";
- for (int i = 0; i < 4; i++) {
- for (int j = 0; j < 2; j++) {
- vae_convert_map["encoder.down_blocks." + std::to_string(i) + ".resnets." + std::to_string(j) + "."] = "encoder.down." + std::to_string(i) + ".block." + std::to_string(j) + ".";
- }
- if (i < 2) {
- vae_convert_map["mid_block.resnets." + std::to_string(i) + "."] = "mid.block_" + std::to_string(i + 1) + ".";
- }
- if (i < 3) {
- vae_convert_map["down_blocks." + std::to_string(i) + ".downsamplers.0."] = "down." + std::to_string(i) + ".downsample.";
- vae_convert_map["up_blocks." + std::to_string(i) + ".upsamplers.0."] = "up." + std::to_string(3 - i) + ".upsample.";
- }
- for (int j = 0; j < 3; j++) {
- vae_convert_map["decoder.up_blocks." + std::to_string(i) + ".resnets." + std::to_string(j) + "."] = "decoder.up." + std::to_string(3 - i) + ".block." + std::to_string(j) + ".";
- }
- }
- }
-
- if (hf_attention || params.version == VERSION_2_x) {
- vae_convert_map["to_k."] = "k.";
- vae_convert_map["to_q."] = "q.";
- vae_convert_map["to_v."] = "v.";
- vae_convert_map["to_out.0."] = "proj_out.";
- }
-
- if (hf_attention) {
- vae_convert_map["key."] = "k.";
- vae_convert_map["query."] = "q.";
- vae_convert_map["value."] = "v.";
- vae_convert_map["group_norm."] = "norm.";
- vae_convert_map["proj_attn."] = "proj_out.";
- }
-
- return replace_name_by_map(name, vae_convert_map);
-}
-
-std::string convert_clip_to_hf_clip(std::string name, ConvertParams params) {
- std::string separator = params.lora ? "." : "_";
- if (clip_convert_map.empty()) {
- if (params.version == VERSION_2_x) {
- clip_convert_map[".model."] = ".transformer.text_model.";
- clip_convert_map["transformer.resblocks."] = "encoder.layers.";
- clip_convert_map["attn.out_proj"] = "self_attn.out_proj";
- clip_convert_map["ln_final."] = "final_layer_norm.";
- clip_convert_map["token_embedding.weight"] =
- "embeddings.token_embedding.weight";
- clip_convert_map["positional_embedding"] =
- "embeddings.position_embedding.weight";
- } else {
- clip_convert_map["resblocks."] = "text_model.encoder.layers.";
- if (!params.lora) {
- clip_convert_map[".attn."] = ".self_attn.";
- }
- clip_convert_map["ln_final."] = "transformer.text_model.final_layer_norm.";
- if (name == "token_embedding.weight") {
- return "transformer.text_model.embeddings.token_embedding.weight";
- } else if (name == "positional_embedding") {
- return "transformer.text_model.embeddings.position_embedding.weight";
- }
- }
- clip_convert_map["ln_1."] = "layer_norm1.";
- clip_convert_map["ln_2."] = "layer_norm2.";
- clip_convert_map[".c_fc."] = ".fc1.";
- clip_convert_map[".c_proj."] = ".fc2.";
- }
- if (params.lora) {
- clip_convert_map["te.text.model"] = "cond_stage_model.transformer.text_model";
- }
- // SD XL to SD normal
- if (params.version == VERSION_XL) {
- clip_convert_map["conditioner.embedders.0.transformer.text_model"] = "cond_stage_model.transformer.text_model";
- clip_convert_map["conditioner.embedders.1.model"] = "cond_stage_model.g.transformer.text_model";
- }
- return replace_name_by_map(name, clip_convert_map);
-}
-
-std::string fix_lora_names(std::string name) {
- // lora fix names
- if (lora_fix_map.empty()) {
- lora_fix_map["self.attn"] = "self_attn";
- lora_fix_map["proj.in"] = "proj_in";
- lora_fix_map["proj.out"] = "proj_out";
- lora_fix_map["out.proj"] = "out_proj";
- lora_fix_map["transformer.blocks"] = "transformer_blocks";
- lora_fix_map["q.proj"] = "q_proj";
- lora_fix_map["k.proj"] = "k_proj";
- lora_fix_map["v.proj"] = "v_proj";
- lora_fix_map["to.q"] = "to_q";
- lora_fix_map["to.k"] = "to_k";
- lora_fix_map["to.v"] = "to_v";
- lora_fix_map[".to.out"] = ".to_out";
- lora_fix_map[".lora.down."] = ".lora_down.";
- lora_fix_map[".lora.up."] = ".lora_up.";
- }
- return replace_name_by_map(name, lora_fix_map);
-}
-
-void* fetch_data(Tensor tensor, ConvertParams params) {
- if (!tensor.data) { // fetch tensor data from zip (.ckpt) or file stream (.safetensors)
- if (tensor.ptr_type == CHECKPOINT) {
- zip_entry_openbyindex(params.pkl_fp[tensor.ptr_idx], tensor.data_offset);
- size_t buf_sz;
- if (zip_entry_read(params.pkl_fp[tensor.ptr_idx], &tensor.data, &buf_sz) < 0) {
- return NULL;
- }
- } else {
-#ifdef _WIN32
- _fseeki64(params.sf_fp[tensor.ptr_idx], (__int64)tensor.data_offset, SEEK_SET);
-#else
- std::fseek(params.sf_fp[tensor.ptr_idx], (long)tensor.data_offset, SEEK_SET);
-#endif
- tensor.data = malloc(tensor.data_size);
- sd_fread(tensor.data, 1, tensor.data_size, params.sf_fp[tensor.ptr_idx]);
- }
- }
- return tensor.data;
-}
-
-std::tuple<Tensor, Tensor, Tensor> split_qkv_tensor(Tensor qkv_tensor, void* qkv_data) {
- const int ne0 = qkv_tensor.shape[0] / 3; // split in 3 tensors: query, key, value
- const int ne1 = qkv_tensor.shape[1];
- const int32_t num_elements = ne0 * ne1;
- ggml_type dtype = qkv_tensor.dtype;
- const int n_dims = qkv_tensor.n_dims;
-
- size_t chunk_size = (size_t)num_elements * ggml_type_size(qkv_tensor.dtype);
-
- int32_t ne[4] = {ne0, ne1, 1, 1};
-
- Tensor q = Tensor("", dtype, chunk_size, ne, n_dims, num_elements, true); // query
- Tensor k = Tensor("", dtype, chunk_size, ne, n_dims, num_elements, true); // key
- Tensor v = Tensor("", dtype, chunk_size, ne, n_dims, num_elements, true); // value
-
- // make a view of original tensor data
- q.data = qkv_data;
- k.data = ((char*)qkv_data) + chunk_size;
- v.data = ((char*)qkv_data) + chunk_size * 2;
- return {q, k, v};
-}
-
-void preprocess_tensors(
- TensorMap& src,
-    std::vector<Tensor>& dst,
- ConvertParams& params) {
- printf("preprocessing %zu tensors\n", src.size());
- for (auto& it : src) {
- std::string name = it.first;
- Tensor tensor = it.second;
- if (!tensor.detect_target(params)) {
- if (tensor.target == CLIP && name.find("cond_stage_model.transformer.text_model") == std::string::npos) {
- if (name.find("text_model.") == 0) {
- tensor.name = "cond_stage_model.transformer." + name;
- } else {
- tensor.name = "cond_stage_model.transformer.text_model" + name;
- }
- } else if (name.find("model.diffusion_model.") == std::string::npos && tensor.target == UNET) {
- tensor.name = "model.diffusion_model." + name;
- } else if (name.find("first_stage_model.") == std::string::npos && tensor.target == VAE) {
- tensor.name = "first_stage_model." + name;
- }
- }
-
- if (tensor.target == VAE) {
- tensor.name = convert_vae_to_original(tensor.name, params);
-
- // convert vae attn block linear to conv2d 1x1
- if (params.vae && name.find("first_stage_model.") == std::string::npos) {
- tensor.name = "first_stage_model." + tensor.name;
- }
-
- if (tensor.name.find("attn_1") != std::string::npos) {
- if (tensor.n_dims == 2) {
- tensor.n_dims += 2;
- if (params.verbose) {
- printf("linear to conv2d %s\n", tensor.name.c_str());
- }
- }
- }
- }
- if (tensor.target == CLIP) {
- tensor.name = convert_clip_to_hf_clip(tensor.name, params);
- if (params.version == VERSION_2_x) {
- size_t fw = tensor.name.find("attn.in_proj_weight");
- size_t fb = tensor.name.find("attn.in_proj_bias");
- if (fw != std::string::npos) {
- Tensor q, k, v;
- std::tie(q, k, v) = split_qkv_tensor(tensor, fetch_data(tensor, params));
- for (int i = 0; i < 3; i++) {
- Tensor attn_t = i == 0 ? q : (i == 1 ? k : v);
- attn_t.name = tensor.name.substr(0, fw) + kqv_self[i];
- dst.push_back(attn_t);
- if (params.verbose) {
- printf("split %s => %s\n", it.first.c_str(), attn_t.name.c_str());
- }
- }
- continue;
- } else if (fb != std::string::npos) {
- Tensor q, k, v;
- std::tie(q, k, v) = split_qkv_tensor(tensor, fetch_data(tensor, params));
- for (int i = 0; i < 3; i++) {
- Tensor attn_t = i == 0 ? q : (i == 1 ? k : v);
- attn_t.name = tensor.name.substr(0, fb) + kqv_self[i + 3];
- dst.push_back(attn_t);
- if (params.verbose) {
- printf("split %s => %s\n", it.first.c_str(), attn_t.name.c_str());
- }
- }
- continue;
- }
- }
- } else if (tensor.target == UNET) {
- tensor.name = convert_unet_to_original(tensor.name, params);
- if (tensor.name.find("proj_in.weight") != std::string::npos ||
- tensor.name.find("proj_out.weight") != std::string::npos) {
- if (tensor.n_dims == 2) {
- tensor.n_dims += 2;
- if (params.verbose) {
- printf("linear to conv2d %s\n", tensor.name.c_str());
- }
- }
- }
- }
-
- if (params.lora) {
- tensor.name = fix_lora_names(tensor.name);
- }
-
- if (is_unused_tensor(tensor.name)) { // discard tensors
- continue;
- }
-
- if (params.lora) {
- int pos = (int)name.find("lora.up.weight");
- if (pos != std::string::npos) {
- std::string key = name.substr(0, pos) + "alpha";
- if (params.lora_alphas.find(key) != params.lora_alphas.end()) {
- int kpos = (int)tensor.name.find("lora_up");
- std::string target = tensor.name.substr(0, kpos) + "alpha";
- params.alpha_keys.emplace(target);
- params.alpha_values.push_back(params.lora_alphas[key]);
- } else {
- printf("WARNING: missing alpha '%s'\n", key.c_str());
- }
- }
- }
- dst.push_back(tensor);
- }
- if (params.lora) {
- params.lora_alphas.clear();
- }
-}
-
-void* convert_tensor(void* source, Tensor tensor, ggml_type dst_type) {
- if (tensor.dtype == GGML_TYPE_F32 && dst_type == GGML_TYPE_F16) {
- ggml_fp16_t* dest = (ggml_fp16_t*)malloc(tensor.num_elements * sizeof(ggml_fp16_t));
- ggml_fp32_to_fp16_row((float*)source, dest, tensor.num_elements);
- return dest;
- } else if (tensor.dtype == GGML_TYPE_F16 && dst_type == GGML_TYPE_F32) {
- float* dest = (float*)malloc(tensor.num_elements * sizeof(float));
- ggml_fp16_to_fp32_row((ggml_fp16_t*)source, dest, tensor.num_elements);
- return dest;
- } else if (
- dst_type == GGML_TYPE_Q4_0 ||
- dst_type == GGML_TYPE_Q4_1 ||
- dst_type == GGML_TYPE_Q5_0 ||
- dst_type == GGML_TYPE_Q5_1 ||
- dst_type == GGML_TYPE_Q8_0) {
- // in development
- int num_blocks = tensor.shape[0] * tensor.shape[1] / ggml_blck_size(dst_type);
- float* src = nullptr;
- if (tensor.dtype == GGML_TYPE_F16) {
- src = (float*)malloc(tensor.num_elements * sizeof(float));
- ggml_fp16_to_fp32_row((ggml_fp16_t*)source, src, tensor.num_elements);
- } else {
- src = (float*)source;
- }
- int64_t* hist = new int64_t[16];
- void* quantized = malloc(ggml_type_size(dst_type) * num_blocks);
- ggml_quantize_chunk(dst_type, src, quantized, 0, tensor.num_elements, hist);
- if (tensor.dtype == GGML_TYPE_F16) {
- free(src);
- }
- delete[] hist;
- return quantized;
- } else {
- throw std::invalid_argument("unsupported conversion");
- }
- return NULL;
-}
-
-void convert_to_gguf(TensorMap& tensors, ConvertParams& params) {
- if (params.lora && params.out_type != GGML_TYPE_F32 && params.out_type != GGML_TYPE_F16) {
- printf("Error: The LoRa conversion only supports f32 and f16.\n");
- return;
- }
- if (!params.vae &&
- tensors.find("first_stage_model.post_quant_conv.bias") == tensors.end() && // is not a stable diffusion model
- tensors.find("post_quant_conv.bias") != tensors.end() && !params.from_folder &&
- params.custom_vae_path.empty()) { // has a tensor of VAE
- params.vae = true;
- printf("VAE detected\n");
- }
-
- if (!params.lora && tensors.find("alphas_cumprod") == tensors.end()) {
- params.generate_alphas_cumprod = true;
- }
-
-    std::vector<Tensor> processed_tensors;
-
- if (!params.lora) {
- if (tensors.find("cond_stage_model.model.token_embedding.weight") != tensors.end()) {
- params.version = VERSION_2_x;
- printf("Stable Diffusion 2.x - %s\n", params.model_name.c_str());
- } else if (tensors.find("conditioner.embedders.0.transformer.text_model.embeddings.position_embedding.weight") != tensors.end()) {
- params.version = VERSION_XL;
- printf("Stable Diffusion XL - %s\n", params.model_name.c_str());
- } else {
- printf("Stable Diffusion 1.x - %s\n", params.model_name.c_str());
- }
- }
-
- preprocess_tensors(tensors, processed_tensors, params);
-
- gguf_context* g_ctx = gguf_init_empty();
-
- if (params.lora) {
- gguf_set_val_str(g_ctx, "sd.lora.name", params.model_name.c_str());
- gguf_set_val_i32(g_ctx, "sd.lora.dtype", (int)params.out_type);
- gguf_set_val_i32(g_ctx, "sd.lora.type", (int)params.lora_type);
-
- printf("writing %zu lora alphas\n", params.alpha_keys.size());
-        std::vector<const char*> dest;
- for (const auto& src : params.alpha_keys) {
- dest.push_back(src.c_str());
- }
- gguf_set_arr_str(g_ctx, "sd.lora.alphas_k", dest.data(), (int)dest.size());
- gguf_set_arr_data(g_ctx, "sd.lora.alphas_v", GGUF_TYPE_FLOAT32, params.alpha_values.data(), (int)params.alpha_values.size());
- } else if (params.vae) {
- gguf_set_val_str(g_ctx, "sd.vae.name", params.model_name.c_str());
- gguf_set_val_i32(g_ctx, "sd.vae.dtype", (int)params.out_type);
- gguf_set_val_i32(g_ctx, "sd.vae.type", (int)params.lora_type);
- } else {
- // process vocab
-        std::map<int, std::string> vocab_map;
-        std::vector<const char*> vocab_data;
- read_vocab_json(vocab_map, params);
-
- for (int i = 0; i < vocab_map.size(); i++) {
- vocab_data.push_back(vocab_map[i].c_str());
- }
-
- gguf_set_val_str(g_ctx, "sd.model.name", params.model_name.c_str());
- gguf_set_val_i32(g_ctx, "sd.model.dtype", (int)params.out_type);
- gguf_set_val_i8(g_ctx, "sd.model.version", (int)params.version);
-
- // write vocab
- if (params.verbose) {
- printf("writing vocab: %zu tokens\n", vocab_data.size());
- }
- gguf_set_arr_str(g_ctx, "sd.vocab.tokens", vocab_data.data(), (int)vocab_data.size());
- }
-
- printf("converting %zu tensors\n", processed_tensors.size());
-
- // write tensors
- ggml_context* ctx = ggml_init({(processed_tensors.size() + (params.generate_alphas_cumprod ? 1 : 0)) * ggml_tensor_overhead(), NULL, true}); // no alloc data
- int num_clip_tensors = 0, num_unet_tensors = 0, num_vae_tensors = 0;
- size_t total_org_model = 0, total_conv_model = 0;
-
- for (Tensor& tensor : processed_tensors) {
- if (tensor.name.size() >= GGML_MAX_NAME) {
- printf("Error: tensor name very large '%s', might not be supported anyway by stable-diffusion.cpp\n", tensor.name.c_str());
- exit(0);
- return;
- }
- if (tensor.target == CLIP) {
- num_clip_tensors++;
- } else if (tensor.target == UNET) {
- num_unet_tensors++;
- } else if (tensor.target == VAE) {
- num_vae_tensors++;
- }
- ggml_type dest_type = GGML_TYPE_F32;
- if (tensor.name.find(".weight") && tensor.n_dims == 2) { // allow quantize only weights
- dest_type = params.out_type;
- } else if (tensor.n_dims == 4) {
- dest_type = GGML_TYPE_F16;
- }
- ggml_tensor* gg_tensor = ggml_new_tensor(ctx, dest_type, tensor.n_dims, tensor.inverse_shape());
- ggml_set_name(gg_tensor, tensor.name.c_str());
- void* source = fetch_data(tensor, params);
- void* dest = NULL;
- if (params.verbose) {
- printf("converting: %s | %s => %s\n", tensor.name.c_str(), ggml_type_name(tensor.dtype), ggml_type_name(dest_type));
- }
- if (tensor.dtype == dest_type) {
- dest = source;
- } else {
- // convert
- dest = convert_tensor(source, tensor, dest_type);
- if (!tensor.is_view) {
- free(source);
- }
- }
- gguf_add_tensor(g_ctx, gg_tensor);
- gguf_set_tensor_data(g_ctx, tensor.name.c_str(), dest, ggml_nbytes(gg_tensor));
- total_org_model += tensor.data_size;
- total_conv_model += ggml_nbytes(gg_tensor);
- }
- if (params.generate_alphas_cumprod) {
- ggml_tensor* gg_tensor = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, TIMESTEPS);
- ggml_set_name(gg_tensor, "alphas_cumprod");
- gguf_add_tensor(g_ctx, gg_tensor);
- float* dest = calculate_alpha_cumprod();
- gguf_set_tensor_data(g_ctx, "alphas_cumprod", dest, ggml_nbytes(gg_tensor));
- printf("alphas_cumprod computed\n");
- }
- printf("\nCLIP Model Tensor count: %i\nUNET Model Tensor count: %i\nVAE Model Tensor count: %i\n\nsaving gguf file\n",
- num_clip_tensors,
- num_unet_tensors,
- num_vae_tensors);
- if (params.output_path.empty()) {
- size_t last = params.model_path.find_last_of("/\\");
- params.model_name = params.model_path.substr(last + 1);
- last = params.from_folder ? params.model_name.length() : params.model_name.find_last_of(".");
- if (!params.lora) {
- params.output_path = params.model_name.substr(0, last) + "-" + ggml_type_name(params.out_type) + ".gguf";
- } else {
- params.output_path = params.model_name.substr(0, last) + ".gguf";
- }
- }
- gguf_write_to_file(g_ctx, params.output_path.c_str(), false, true);
- printf("model saved '%s' correctly.\n", params.output_path.c_str());
- ggml_free(ctx);
- gguf_free(g_ctx);
-}
-
-void load_checkpoint(const char* file_name, TensorMap& tensors, ConvertParams& params, bool root_model, TensorTarget target = NONE) {
- struct zip_t* zip = zip_open(file_name, 0, 'r');
- {
- int i, n = (int)zip_entries_total(zip);
- for (i = 0; i < n; ++i) {
- zip_entry_openbyindex(zip, i);
- {
- std::string name = zip_entry_name(zip);
- int isdir = zip_entry_isdir(zip);
- unsigned long long size = zip_entry_size(zip);
- unsigned int crc32 = zip_entry_crc32(zip);
- size_t res = name.find("data.pkl");
- if (res != std::string::npos) {
- std::string dir_ = name.substr(0, res);
- void* pkl_data = NULL;
- size_t pkl_size;
- zip_entry_read(zip, &pkl_data, &pkl_size);
- read_pkl_props((uint8_t*)pkl_data, zip, dir_, tensors, params, root_model, target);
- }
- }
- zip_entry_close(zip);
- }
- }
- params.pkl_fp.push_back(zip);
-}
-
-void load_safetensors(FILE* fp, int64_t metadata_size, TensorMap& tensors, ConvertParams& params, bool root_model, TensorTarget target = NONE) {
- std::fseek(fp, 8, SEEK_SET); // from begin
-
- char* metadata_buffer = new char[metadata_size + 1];
- memset(metadata_buffer, 0, metadata_size + 1);
- sd_fread(metadata_buffer, 1, metadata_size, fp);
- json sf_mt = json::parse(metadata_buffer);
-
- int data_begin = 8 + (int)metadata_size;
- for (json::iterator it = sf_mt.begin(); it != sf_mt.end(); ++it) {
- std::string tensor_name = it.key();
- json tensor_props = it.value();
-
- // auto detect lora
- if (!params.lora) {
- if ((tensor_name == "__metadata__" && tensor_props.contains("ss_network_module")) ||
- tensor_name.find("lora_") == 0 &&
- (tensor_name.find("lora_up.weight") != std::string::npos ||
- tensor_name.find("lora_down.weight") != std::string::npos ||
- tensor_name.find(".alpha") != std::string::npos)) {
- params.lora = true;
- printf("LoRA detected\n");
- }
- }
-
- if (tensor_props.contains("dtype") && !is_unused_tensor(tensor_name)) { // check if there dtype param
- int n_dims = (int)tensor_props["shape"].size();
- std::string dtype = tensor_props["dtype"];
-            size_t start_data = tensor_props["data_offsets"][0].get<size_t>();
-            size_t end_data = tensor_props["data_offsets"][1].get<size_t>();
-
- if (params.lora) {
- if (params.lora_type == LORA_NONE) {
- if (tensor_name.find("lora_up.weight") != std::string::npos) {
- params.lora_type = LORA_REGULAR;
- printf("Lora type Regular\n");
- } else if (tensor_name.find("lora.up.weight") != std::string::npos) {
- params.lora_type = LORA_DIFFUSERS;
- printf("Lora type Diffusers\n");
- } else if (tensor_name.find("lora_linear_layer.up.weight") != std::string::npos) {
- params.lora_type = LORA_TRANSFORMERS;
- printf("Lora type Transformers\n");
- }
- }
- // replace all '_' to '.'
- for (char& c : tensor_name) {
- if (c == '_') {
- c = '.';
- }
- }
- }
-
- // collect alphas
- if (params.lora &&
- n_dims == 0 &&
- tensor_name.find(".alpha") != std::string::npos) {
- std::fseek(fp, data_begin + (int)start_data, SEEK_SET);
- if (dtype == "F16") {
- ggml_fp16_t val;
- sd_fread(&val, 1, sizeof(val), fp);
- params.lora_alphas[tensor_name] = ggml_fp16_to_fp32(val);
- } else if (dtype == "F32") {
- float val;
- sd_fread(&val, 1, sizeof(val), fp);
- params.lora_alphas[tensor_name] = val;
- } else if (dtype == "BF16") { // force float 32 bits
- uint16_t val;
- sd_fread(&val, 1, sizeof(val), fp);
- params.lora_alphas[tensor_name] = bfloat16_to_fp32(val);
- }
- continue;
- }
-
- Tensor tensor;
- tensor.name = tensor_name;
- tensor.n_dims = n_dims;
- tensor.ptr_idx = (int)params.sf_fp.size();
- if (target != NONE) {
- tensor.target = target;
- } else if (params.merge_custom_vae) {
- if (root_model) {
- tensor.detect_target(params);
- if (tensor.target == VAE) {
- continue; // ignore original vae tensors
- }
- } else {
- tensor.target = VAE;
- tensor.detect_target(params);
- }
- }
- tensor.ptr_type = SAFETENSOR;
- tensor.data_size = end_data - start_data;
- if (dtype == "F16") {
- tensor.dtype = GGML_TYPE_F16;
- } else if (dtype == "F64") { // force float 32 bits
- void* data = (void*)malloc(tensor.data_size);
- std::fseek(fp, data_begin + (int)start_data, SEEK_SET);
- sd_fread(data, 1, tensor.data_size, fp);
- tensor.data_size /= 2;
- tensor.data = malloc(tensor.data_size);
- int ne = (int)tensor.data_size / (int)ggml_type_size(tensor.dtype);
- for (int i = 0; i < ne; i++) {
- ((float*)tensor.data)[i] = (float)((double*)data)[i];
- }
- free(data);
- } else if (dtype == "BF16") { // force float 32 bits
- void* data = (void*)malloc(tensor.data_size);
- std::fseek(fp, data_begin + (int)start_data, SEEK_SET);
- sd_fread(data, 1, tensor.data_size, fp);
- tensor.data_size *= 2;
- tensor.data = malloc(tensor.data_size);
- int ne = (int)tensor.data_size / (int)ggml_type_size(tensor.dtype);
- for (int i = 0; i < ne; i++) {
- ((float*)tensor.data)[i] = bfloat16_to_fp32(((uint16_t*)data)[i]);
- }
- free(data);
- } else if (dtype != "F32") {
- printf("unsupported model data type: %s", dtype.c_str());
- return;
- }
-
- for (uint8_t i = 0; i < n_dims; i++) {
- tensor.shape[i] = tensor_props["shape"][i];
- }
-
- tensor.num_elements = (int32_t)tensor.data_size / (int32_t)ggml_type_size(tensor.dtype);
- tensor.data_offset = data_begin + start_data;
- tensors[tensor_name] = tensor;
- }
- }
-
- // finished read metadata
- params.sf_fp.push_back(fp);
-}
-
-void load_tensors_from_model(std::string path, TensorMap& tensors, ConvertParams& params, bool root_model, TensorTarget target = NONE) {
- // check if the model is safetensor or pytorch checkpoint
- FILE* fp = std::fopen(path.c_str(), "rb");
- if (!fp) {
- printf("Fail to open file: %s", params.model_path.c_str());
- return;
- }
- std::fseek(fp, 0, SEEK_END);
- size_t file_size = ftell(fp);
- // return to begin
- std::fseek(fp, 0, SEEK_SET);
- // read first 9 bytes
- uint8_t buffer_[9];
- sd_fread(buffer_, 1, 9, fp);
- int64_t safe_tensor_metadata_size = read_long(buffer_);
- bool safe_tensor = false;
- if (
- buffer_[8] == '{' &&
- safe_tensor_metadata_size > 0 &&
- safe_tensor_metadata_size < (int64_t)file_size) { // begin safetensor metadata
- size_t offset = safe_tensor_metadata_size + /* long */ 8L - 1L;
-#ifdef _WIN32
- _fseeki64(fp, (__int64)offset, SEEK_SET);
-#else
- std::fseek(fp, (long)offset, SEEK_SET);
-#endif
- sd_fread(buffer_, 1, 1, fp);
- safe_tensor = buffer_[0] == '}' || buffer_[0] == ' ';
- } else {
- std::fclose(fp);
- }
- printf("loading model '%s'\n", path.c_str());
- printf("model type: %s\n", safe_tensor ? "safetensors" : "checkpoint");
- if (safe_tensor) {
- load_safetensors(fp, safe_tensor_metadata_size, tensors, params, root_model, target);
- } else {
- load_checkpoint(params.model_path.c_str(), tensors, params, root_model, target);
- }
-}
-
-void convert_model(ConvertParams& params) {
- TensorMap loaded_tensors;
- size_t last = params.model_path.find_last_of("/\\");
- params.model_name = params.model_path.substr(last + 1);
- if (params.from_folder) {
- // Hardcoded in https://github.com/huggingface/diffusers/blob/main/scripts/convert_diffusers_to_original_stable_diffusion.py
- std::string diff_clip_path = params.model_path + "/text_encoder/model.safetensors";
- std::string diff_unet_path = params.model_path + "/unet/diffusion_pytorch_model.safetensors";
- std::string diff_vae_path = params.model_path + "/vae/diffusion_pytorch_model.safetensors";
- if (file_exists(diff_clip_path)) {
- load_tensors_from_model(diff_clip_path, loaded_tensors, params, true, CLIP);
- } else {
- printf("ERROR: missing CLIP model: %s\n", diff_clip_path.c_str());
- exit(0);
- }
- if (file_exists(diff_unet_path)) {
- load_tensors_from_model(diff_unet_path, loaded_tensors, params, true, UNET);
- } else {
- printf("ERROR: missing UNET model: %s\n", diff_unet_path.c_str());
- exit(0);
- }
- if (file_exists(diff_vae_path)) {
- load_tensors_from_model(diff_vae_path, loaded_tensors, params, true, VAE);
- } else {
- printf("ERROR: missing VAE model: %s\n", diff_vae_path.c_str());
- exit(0);
- }
- } else {
- load_tensors_from_model(params.model_path.c_str(), loaded_tensors, params, true);
- if (params.merge_custom_vae) {
- load_tensors_from_model(params.custom_vae_path.c_str(), loaded_tensors, params, false);
- }
- }
- convert_to_gguf(loaded_tensors, params);
-}
-
-void print_usage(int argc, const char* argv[]) {
- printf("usage: %s [MODEL_PATH] --type [OUT_TYPE] [arguments]\n", argv[0]);
- printf("Model supported for conversion: .safetensors models or .ckpt checkpoints models\n");
- printf("\n");
- printf("arguments:\n");
- printf(" -h, --help show this help message and exit\n");
- printf(" -o, --out [FILENAME] path or name to converted model\n");
- printf(" --vocab [FILENAME] path to custom vocab.json (usually unnecessary)\n");
- printf(" -v, --verbose print processing info - dev info\n");
- printf(" -l, --lora force read the model as a LoRA\n");
- printf(" --vae [FILENAME] merge a custom VAE\n");
- printf(" -t, --type [OUT_TYPE] output format (f32, f16, q4_0, q4_1, q5_0, q5_1, q8_0)\n");
-}
-
-bool parse_params(int argc, const char* argv[], ConvertParams& params) {
- params.model_path = argv[1];
- if (is_directory(params.model_path)) {
- params.from_folder = true;
- // if the model path ends with '/' ignore it
- if (params.model_path.size() > 0 && params.model_path.back() == '/') {
- params.model_path.pop_back();
- }
- printf("loading diffusers model\n");
- }
- for (int i = 2; i < argc; i++) {
- std::string arg = argv[i];
- if (arg == "-o" || arg == "--out") {
- if (++i >= argc) {
- break;
- }
- params.output_path = argv[i];
- if (params.output_path.find(".gguf") == std::string::npos) {
- params.output_path = params.output_path + ".gguf";
- }
- } else if (arg == "--vocab") {
- if (++i >= argc) {
- break;
- }
- params.vocab_path = argv[i];
- } else if (arg == "-l" || arg == "--lora") {
- params.lora = true;
- } else if (arg == "-v" || arg == "--verbose") {
- params.verbose = true;
- } else if (arg == "--vae") {
- if (++i >= argc) {
- break;
- }
- params.custom_vae_path = argv[i];
- if (file_exists(params.custom_vae_path)) {
- params.merge_custom_vae = true;
- printf("merge custom vae '%s'\n", params.custom_vae_path.c_str());
- }
- } else if (arg == "--type" || arg == "-t") {
- if (++i >= argc) {
- printf("specify the output format\n");
- exit(1);
- }
- std::string fmt_select = argv[i];
- if (fmt_select == "f32") {
- params.out_type = GGML_TYPE_F32;
- } else if (fmt_select == "f16") {
- params.out_type = GGML_TYPE_F16;
- } else if (fmt_select == "q4_0") {
- params.out_type = GGML_TYPE_Q4_0;
- } else if (fmt_select == "q4_1") {
- params.out_type = GGML_TYPE_Q4_1;
- } else if (fmt_select == "q5_0") {
- params.out_type = GGML_TYPE_Q5_0;
- } else if (fmt_select == "q5_1") {
- params.out_type = GGML_TYPE_Q5_1;
- } else if (fmt_select == "q8_0") {
- params.out_type = GGML_TYPE_Q8_0;
- } else {
- fprintf(stderr, "error: invalid output format %s, must be one of [f32, f16, q4_0, q4_1, q5_0, q5_1, q8_0]\n",
- fmt_select.c_str());
- exit(1);
- }
- } else if (arg == "-h" || arg == "--help") {
- print_usage(argc, argv);
- return false;
- } else {
- fprintf(stderr, "error: unknown argument: %s\n", arg.c_str());
- print_usage(argc, argv);
- exit(1);
- }
- }
- if (params.model_path.empty()) {
- fprintf(stderr, "error: missing model input path\n");
- print_usage(argc, argv);
- exit(1);
- }
- return true;
-}
-
-// support safetensors and ckpt (pikle)
-
-int main(int argc, const char* argv[]) {
- ConvertParams params;
- if (argc > 2) {
- // needed to initialize f16 tables
- {
- struct ggml_init_params params = {0, NULL, false};
- struct ggml_context* ctx = ggml_init(params);
- ggml_free(ctx);
- }
- // parse params
- if (parse_params(argc, argv, params)) {
- convert_model(params);
- }
- } else {
- print_usage(argc, argv);
- }
-}
\ No newline at end of file
diff --git a/format-code.sh b/format-code.sh
new file mode 100644
index 0000000..27e45a9
--- /dev/null
+++ b/format-code.sh
@@ -0,0 +1,2 @@
+clang-format -style=file -i *.cpp *.h
+clang-format -style=file -i examples/cli/*.cpp
\ No newline at end of file
diff --git a/model.cpp b/model.cpp
new file mode 100644
index 0000000..616a9f0
--- /dev/null
+++ b/model.cpp
@@ -0,0 +1,1312 @@
+#include
+#include
+#include
+#include
+#include
+#include
+
+#include "model.h"
+#include "stable-diffusion.h"
+#include "util.h"
+#include "vocab.hpp"
+
+#include "ggml/ggml-alloc.h"
+#include "ggml/ggml-backend.h"
+#include "ggml/ggml.h"
+
+uint64_t read_u64(uint8_t* buffer) {
+ // little endian
+ uint64_t value = 0;
+    value |= static_cast<uint64_t>(buffer[7]) << 56;
+    value |= static_cast<uint64_t>(buffer[6]) << 48;
+    value |= static_cast<uint64_t>(buffer[5]) << 40;
+    value |= static_cast<uint64_t>(buffer[4]) << 32;
+    value |= static_cast<uint64_t>(buffer[3]) << 24;
+    value |= static_cast<uint64_t>(buffer[2]) << 16;
+    value |= static_cast<uint64_t>(buffer[1]) << 8;
+    value |= static_cast<uint64_t>(buffer[0]);
+ return value;
+}
+
+int32_t read_int(uint8_t* buffer) {
+ // little endian
+ int value = 0;
+ value |= buffer[3] << 24;
+ value |= buffer[2] << 16;
+ value |= buffer[1] << 8;
+ value |= buffer[0];
+ return value;
+}
+
+uint16_t read_short(uint8_t* buffer) {
+ // little endian
+ uint16_t value = 0;
+ value |= buffer[1] << 8;
+ value |= buffer[0];
+ return value;
+}
+
+/*================================================= Preprocess ==================================================*/
+
+std::string self_attn_names[] = {
+ "self_attn.q_proj.weight",
+ "self_attn.k_proj.weight",
+ "self_attn.v_proj.weight",
+
+ "self_attn.q_proj.bias",
+ "self_attn.k_proj.bias",
+ "self_attn.v_proj.bias",
+};
+
+const char* unused_tensors[] = {
+ "betas",
+ "alphas_cumprod_prev",
+ "sqrt_alphas_cumprod",
+ "sqrt_one_minus_alphas_cumprod",
+ "log_one_minus_alphas_cumprod",
+ "sqrt_recip_alphas_cumprod",
+ "sqrt_recipm1_alphas_cumprod",
+ "posterior_variance",
+ "posterior_log_variance_clipped",
+ "posterior_mean_coef1",
+ "posterior_mean_coef2",
+ "cond_stage_model.transformer.text_model.embeddings.position_ids",
+ "cond_stage_model.model.logit_scale",
+ "cond_stage_model.model.text_projection",
+ "model.diffusion_model.time_embedding.cond_proj.weight",
+ "unet.time_embedding.cond_proj.weight",
+ "model_ema.decay",
+ "model_ema.num_updates",
+ "model_ema.diffusion_model",
+ "control_model",
+ "embedding_manager",
+};
+
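+// true when the name matches one of the bookkeeping tensors above (scheduler constants,
+// EMA copies, control/embedding-manager entries) that inference never reads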
+bool is_unused_tensor(std::string name) {
+ for (int i = 0; i < sizeof(unused_tensors) / sizeof(const char*); i++) {
+ if (starts_with(name, unused_tensors[i])) {
+ return true;
+ }
+ }
+ return false;
+}
+
+std::unordered_map<std::string, std::string> open_clip_to_hf_clip_model = {
+ {"cond_stage_model.model.ln_final.bias", "cond_stage_model.transformer.text_model.final_layer_norm.bias"},
+ {"cond_stage_model.model.ln_final.weight", "cond_stage_model.transformer.text_model.final_layer_norm.weight"},
+ {"cond_stage_model.model.positional_embedding", "cond_stage_model.transformer.text_model.embeddings.position_embedding.weight"},
+ {"cond_stage_model.model.token_embedding.weight", "cond_stage_model.transformer.text_model.embeddings.token_embedding.weight"},
+
+};
+
+std::unordered_map<std::string, std::string> open_clip_to_hk_clip_resblock = {
+ {"attn.out_proj.bias", "self_attn.out_proj.bias"},
+ {"attn.out_proj.weight", "self_attn.out_proj.weight"},
+ {"ln_1.bias", "layer_norm1.bias"},
+ {"ln_1.weight", "layer_norm1.weight"},
+ {"ln_2.bias", "layer_norm2.bias"},
+ {"ln_2.weight", "layer_norm2.weight"},
+ {"mlp.c_fc.bias", "mlp.fc1.bias"},
+ {"mlp.c_fc.weight", "mlp.fc1.weight"},
+ {"mlp.c_proj.bias", "mlp.fc2.bias"},
+ {"mlp.c_proj.weight", "mlp.fc2.weight"},
+};
+
+std::unordered_map<std::string, std::string> vae_decoder_name_map = {
+ {"first_stage_model.decoder.mid.attn_1.to_k.bias", "first_stage_model.decoder.mid.attn_1.k.bias"},
+ {"first_stage_model.decoder.mid.attn_1.to_k.weight", "first_stage_model.decoder.mid.attn_1.k.weight"},
+ {"first_stage_model.decoder.mid.attn_1.to_out.0.bias", "first_stage_model.decoder.mid.attn_1.proj_out.bias"},
+ {"first_stage_model.decoder.mid.attn_1.to_out.0.weight", "first_stage_model.decoder.mid.attn_1.proj_out.weight"},
+ {"first_stage_model.decoder.mid.attn_1.to_q.bias", "first_stage_model.decoder.mid.attn_1.q.bias"},
+ {"first_stage_model.decoder.mid.attn_1.to_q.weight", "first_stage_model.decoder.mid.attn_1.q.weight"},
+ {"first_stage_model.decoder.mid.attn_1.to_v.bias", "first_stage_model.decoder.mid.attn_1.v.bias"},
+ {"first_stage_model.decoder.mid.attn_1.to_v.weight", "first_stage_model.decoder.mid.attn_1.v.weight"},
+};
+
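+// remap an OpenCLIP (SD 2.x) text-encoder tensor name, including the per-resblock
+// suffixes, onto the HF CLIP naming used by the rest of the loader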
+std::string convert_open_clip_to_hf_clip(const std::string& name) {
+ std::string new_name = name;
+ std::string open_clip_resblock_prefix = "cond_stage_model.model.transformer.resblocks.";
+ std::string hf_clip_resblock_prefix = "cond_stage_model.transformer.text_model.encoder.layers.";
+
+ if (open_clip_to_hf_clip_model.find(name) != open_clip_to_hf_clip_model.end()) {
+ new_name = open_clip_to_hf_clip_model[name];
+ }
+
+ if (name.find(open_clip_resblock_prefix) == 0) {
+ std::string remain = name.substr(open_clip_resblock_prefix.length());
+ std::string idx = remain.substr(0, remain.find("."));
+ std::string suffix = remain.substr(idx.length() + 1);
+
+ if (suffix == "attn.in_proj_weight" || suffix == "attn.in_proj_bias") {
+ new_name = hf_clip_resblock_prefix + idx + "." + suffix;
+ } else if (open_clip_to_hk_clip_resblock.find(suffix) != open_clip_to_hk_clip_resblock.end()) {
+ std::string new_suffix = open_clip_to_hk_clip_resblock[suffix];
+ new_name = hf_clip_resblock_prefix + idx + "." + new_suffix;
+ }
+ }
+
+ return new_name;
+}
+
+std::string convert_vae_decoder_name(const std::string& name) {
+ if (vae_decoder_name_map.find(name) != vae_decoder_name_map.end()) {
+ return vae_decoder_name_map[name];
+ }
+ return name;
+}
+
+std::unordered_map<std::string, std::unordered_map<std::string, std::string>> suffix_conversion_underline = {
+ {
+ "attentions",
+ {
+ {"to_k", "k"},
+ {"to_q", "q"},
+ {"to_v", "v"},
+ {"to_out_0", "proj_out"},
+ {"group_norm", "norm"},
+ },
+ },
+ {
+ "resnets",
+ {
+ {"conv1", "in_layers_2"},
+ {"conv2", "out_layers_3"},
+ {"norm1", "in_layers_0"},
+ {"norm2", "out_layers_0"},
+ {"time_emb_proj", "emb_layers_1"},
+ {"conv_shortcut", "skip_connection"},
+ },
+ },
+};
+
+std::unordered_map<std::string, std::unordered_map<std::string, std::string>> suffix_conversion_dot = {
+ {
+ "attentions",
+ {
+ {"to_k", "k"},
+ {"to_q", "q"},
+ {"to_v", "v"},
+ {"to_out.0", "proj_out"},
+ {"group_norm", "norm"},
+ },
+ },
+ {
+ "resnets",
+ {
+ {"conv1", "in_layers.2"},
+ {"conv2", "out_layers.3"},
+ {"norm1", "in_layers.0"},
+ {"norm2", "out_layers.0"},
+ {"time_emb_proj", "emb_layers.1"},
+ {"conv_shortcut", "skip_connection"},
+ },
+ },
+};
+
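+// translate a diffusers-style name (unet./vae./te. prefixes) into the CompVis/LDM name,
+// using 'seq' as the separator: '_' for LoRA keys, '.' for regular diffusers checkpoints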
+std::string convert_diffusers_name_to_compvis(const std::string& key, char seq) {
+    std::vector<std::string> m;
+
+    auto match = [](std::vector<std::string>& match_list, const std::regex& regex, const std::string& key) {
+ auto r = std::smatch{};
+ if (!std::regex_match(key, r, regex)) {
+ return false;
+ }
+
+ match_list.clear();
+ for (size_t i = 1; i < r.size(); ++i) {
+ match_list.push_back(r.str(i));
+ }
+ return true;
+ };
+
+    std::unordered_map<std::string, std::unordered_map<std::string, std::string>> suffix_conversion;
+ if (seq == '_') {
+ suffix_conversion = suffix_conversion_underline;
+ } else {
+ suffix_conversion = suffix_conversion_dot;
+ }
+
+ auto get_converted_suffix = [&suffix_conversion](const std::string& outer_key, const std::string& inner_key) {
+ auto outer_iter = suffix_conversion.find(outer_key);
+ if (outer_iter != suffix_conversion.end()) {
+ auto inner_iter = outer_iter->second.find(inner_key);
+ if (inner_iter != outer_iter->second.end()) {
+ return inner_iter->second;
+ }
+ }
+ return inner_key;
+ };
+
+ // unet
+ if (match(m, std::regex(format("unet%cconv_in(.*)", seq)), key)) {
+ return format("model%cdiffusion_model%cinput_blocks%c0%c0", seq, seq, seq, seq) + m[0];
+ }
+
+ if (match(m, std::regex(format("unet%cconv%cout(.*)", seq, seq)), key)) {
+ return format("model%cdiffusion_model%cout%c2", seq, seq, seq) + m[0];
+ }
+
+ if (match(m, std::regex(format("unet%cconv_norm_out(.*)", seq)), key)) {
+ return format("model%cdiffusion_model%cout%c0", seq, seq, seq) + m[0];
+ }
+
+ if (match(m, std::regex(format("unet%ctime_embedding%clinear_(\\d+)(.*)", seq, seq)), key)) {
+ return format("model%cdiffusion_model%ctime_embed%c", seq, seq, seq) + std::to_string(std::stoi(m[0]) * 2 - 2) + m[1];
+ }
+
+ if (match(m, std::regex(format("unet%cdown_blocks%c(\\d+)%c(attentions|resnets)%c(\\d+)%c(.+)", seq, seq, seq, seq, seq)), key)) {
+ std::string suffix = get_converted_suffix(m[1], m[3]);
+ // LOG_DEBUG("%s %s %s %s", m[0].c_str(), m[1].c_str(), m[2].c_str(), m[3].c_str());
+ return format("model%cdiffusion_model%cinput_blocks%c", seq, seq, seq) + std::to_string(1 + std::stoi(m[0]) * 3 + std::stoi(m[2])) + seq +
+ (m[1] == "attentions" ? "1" : "0") + seq + suffix;
+ }
+
+ if (match(m, std::regex(format("unet%cmid_block%c(attentions|resnets)%c(\\d+)%c(.+)", seq, seq, seq, seq)), key)) {
+ std::string suffix = get_converted_suffix(m[0], m[2]);
+ return format("model%cdiffusion_model%cmiddle_block%c", seq, seq, seq) + (m[0] == "attentions" ? "1" : std::to_string(std::stoi(m[1]) * 2)) +
+ seq + suffix;
+ }
+
+ if (match(m, std::regex(format("unet%cup_blocks%c(\\d+)%c(attentions|resnets)%c(\\d+)%c(.+)", seq, seq, seq, seq, seq)), key)) {
+ std::string suffix = get_converted_suffix(m[1], m[3]);
+ return format("model%cdiffusion_model%coutput_blocks%c", seq, seq, seq) + std::to_string(std::stoi(m[0]) * 3 + std::stoi(m[2])) + seq +
+ (m[1] == "attentions" ? "1" : "0") + seq + suffix;
+ }
+
+ if (match(m, std::regex(format("unet%cdown_blocks%c(\\d+)%cdownsamplers%c0%cconv", seq, seq, seq, seq, seq)), key)) {
+ return format("model%cdiffusion_model%cinput_blocks%c", seq, seq, seq) + std::to_string(3 + std::stoi(m[0]) * 3) + seq + "0" + seq + "op";
+ }
+
+ if (match(m, std::regex(format("unet%cup_blocks%c(\\d+)%cupsamplers%c0%cconv", seq, seq, seq, seq, seq)), key)) {
+ return format("model%cdiffusion_model%coutput_blocks%c", seq, seq, seq) + std::to_string(2 + std::stoi(m[0]) * 3) + seq +
+ (std::stoi(m[0]) > 0 ? "2" : "1") + seq + "conv";
+ }
+
+ // clip
+ if (match(m, std::regex(format("te%ctext_model%cencoder%clayers%c(\\d+)%c(.+)", seq, seq, seq, seq, seq)), key)) {
+ return format("cond_stage_model%ctransformer%ctext_model%cencoder%clayers%c", seq, seq, seq, seq, seq) + m[0] + seq + m[1];
+ }
+
+ if (match(m, std::regex(format("te%ctext_model(.*)", seq)), key)) {
+ return format("cond_stage_model%ctransformer%ctext_model", seq, seq) + m[0];
+ }
+
+ // vae
+ if (match(m, std::regex(format("vae%c(.*)%cconv_norm_out(.*)", seq, seq)), key)) {
+ return format("first_stage_model%c%s%cnorm_out%s", seq, m[0].c_str(), seq, m[1].c_str());
+ }
+
+ if (match(m, std::regex(format("vae%c(.*)%cmid_block%c(attentions|resnets)%c(\\d+)%c(.+)", seq, seq, seq, seq, seq)), key)) {
+ std::string suffix;
+ std::string block_name;
+ if (m[1] == "attentions") {
+ block_name = "attn";
+ suffix = get_converted_suffix(m[1], m[3]);
+ } else {
+ block_name = "block";
+ suffix = m[3];
+ }
+ return format("first_stage_model%c%s%cmid%c%s_%d%c%s",
+ seq, m[0].c_str(), seq, seq, block_name.c_str(), std::stoi(m[2]) + 1, seq, suffix.c_str());
+ }
+
+ if (match(m, std::regex(format("vae%c(.*)%cup_blocks%c(\\d+)%cresnets%c(\\d+)%c(.+)", seq, seq, seq, seq, seq, seq)), key)) {
+ std::string suffix = m[3];
+ if (suffix == "conv_shortcut") {
+ suffix = "nin_shortcut";
+ }
+ return format("first_stage_model%c%s%cup%c%d%cblock%c%s%c%s",
+ seq, m[0].c_str(), seq, seq, 3 - std::stoi(m[1]), seq, seq, m[2].c_str(), seq, suffix.c_str());
+ }
+
+ if (match(m, std::regex(format("vae%c(.*)%cdown_blocks%c(\\d+)%cdownsamplers%c0%cconv", seq, seq, seq, seq, seq, seq)), key)) {
+ return format("first_stage_model%c%s%cdown%c%d%cdownsample%cconv",
+ seq, m[0].c_str(), seq, seq, std::stoi(m[1]), seq, seq);
+ }
+
+ if (match(m, std::regex(format("vae%c(.*)%cdown_blocks%c(\\d+)%cresnets%c(\\d+)%c(.+)", seq, seq, seq, seq, seq, seq)), key)) {
+ std::string suffix = m[3];
+ if (suffix == "conv_shortcut") {
+ suffix = "nin_shortcut";
+ }
+ return format("first_stage_model%c%s%cdown%c%d%cblock%c%s%c%s",
+ seq, m[0].c_str(), seq, seq, std::stoi(m[1]), seq, seq, m[2].c_str(), seq, suffix.c_str());
+ }
+
+ if (match(m, std::regex(format("vae%c(.*)%cup_blocks%c(\\d+)%cupsamplers%c0%cconv", seq, seq, seq, seq, seq, seq)), key)) {
+ return format("first_stage_model%c%s%cup%c%d%cupsample%cconv",
+ seq, m[0].c_str(), seq, seq, 3 - std::stoi(m[1]), seq, seq);
+ }
+
+ if (match(m, std::regex(format("vae%c(.*)", seq)), key)) {
+ return format("first_stage_model%c", seq) + m[0];
+ }
+
+ return key;
+}
+
+std::string convert_tensor_name(const std::string& name) {
+ std::string new_name;
+ if (starts_with(name, "cond_stage_model.model")) {
+ new_name = convert_open_clip_to_hf_clip(name);
+ } else if (starts_with(name, "first_stage_model.decoder")) {
+ new_name = convert_vae_decoder_name(name);
+ } else if (starts_with(name, "lora_")) { // for lora
+ size_t pos = name.find('.');
+ if (pos != std::string::npos) {
+ std::string name_without_network_parts = name.substr(5, pos - 5);
+ std::string network_part = name.substr(pos + 1);
+ // LOG_DEBUG("%s %s", name_without_network_parts.c_str(), network_part.c_str());
+ std::string new_key = convert_diffusers_name_to_compvis(name_without_network_parts, '_');
+ if (new_key.empty()) {
+ new_name = name;
+ } else {
+ new_name = "lora." + new_key + "." + network_part;
+ }
+ } else {
+ new_name = name;
+ }
+ } else if (starts_with(name, "unet") || starts_with(name, "vae") || starts_with(name, "te")) { // for diffuser
+ size_t pos = name.find_last_of('.');
+ if (pos != std::string::npos) {
+ std::string name_without_network_parts = name.substr(0, pos);
+ std::string network_part = name.substr(pos + 1);
+ // LOG_DEBUG("%s %s", name_without_network_parts.c_str(), network_part.c_str());
+ std::string new_key = convert_diffusers_name_to_compvis(name_without_network_parts, '.');
+ if (new_key.empty()) {
+ new_name = name;
+ } else {
+ new_name = new_key + "." + network_part;
+ }
+ } else {
+ new_name = name;
+ }
+ } else {
+ new_name = name;
+ }
+ // if (new_name != name) {
+ // LOG_DEBUG("%s => %s", name.c_str(), new_name.c_str());
+ // }
+ return new_name;
+}
+
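+// rename a tensor to its internal CompVis-style name, unsqueeze 2D proj_in/proj_out and
+// VAE attn_1 weights so they can be used as 1x1 convolutions, and split fused CLIP
+// attn.in_proj weights/biases into separate q/k/v chunks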
+void preprocess_tensor(TensorStorage tensor_storage,
+                       std::vector<TensorStorage>& processed_tensor_storages) {
+    std::vector<TensorStorage> result;
+ std::string new_name = convert_tensor_name(tensor_storage.name);
+
+ // convert unet transformer linear to conv2d 1x1
+ if (starts_with(new_name, "model.diffusion_model.") &&
+ (ends_with(new_name, "proj_in.weight") || ends_with(new_name, "proj_out.weight"))) {
+ tensor_storage.unsqueeze();
+ }
+
+ // convert vae attn block linear to conv2d 1x1
+ if (starts_with(new_name, "first_stage_model.") && new_name.find("attn_1") != std::string::npos) {
+ tensor_storage.unsqueeze();
+ }
+
+ tensor_storage.name = new_name;
+
+ if (starts_with(new_name, "cond_stage_model.transformer.text_model.encoder.layers.") &&
+ ends_with(new_name, "attn.in_proj_weight")) {
+ size_t prefix_size = new_name.find("attn.in_proj_weight");
+ std::string prefix = new_name.substr(0, prefix_size);
+
+        std::vector<TensorStorage> chunks = tensor_storage.chunk(3);
+ chunks[0].name = prefix + "self_attn.q_proj.weight";
+ chunks[1].name = prefix + "self_attn.k_proj.weight";
+ chunks[2].name = prefix + "self_attn.v_proj.weight";
+
+ processed_tensor_storages.insert(processed_tensor_storages.end(), chunks.begin(), chunks.end());
+
+ } else if (starts_with(new_name, "cond_stage_model.transformer.text_model.encoder.layers.") &&
+ ends_with(new_name, "attn.in_proj_bias")) {
+ size_t prefix_size = new_name.find("attn.in_proj_bias");
+ std::string prefix = new_name.substr(0, prefix_size);
+
+        std::vector<TensorStorage> chunks = tensor_storage.chunk(3);
+ chunks[0].name = prefix + "self_attn.q_proj.bias";
+ chunks[1].name = prefix + "self_attn.k_proj.bias";
+ chunks[2].name = prefix + "self_attn.v_proj.bias";
+
+ processed_tensor_storages.insert(processed_tensor_storages.end(), chunks.begin(), chunks.end());
+ } else {
+ processed_tensor_storages.push_back(tensor_storage);
+ }
+}
+
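+// bf16 is the upper 16 bits of an IEEE-754 float, so shifting it into the high half
+// of a uint32 and reinterpreting recovers the float value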
+float bf16_to_f32(uint16_t bfloat16) {
+    uint32_t val_bits = (static_cast<uint32_t>(bfloat16) << 16);
+    return *reinterpret_cast<float*>(&val_bits);
+}
+
+void bf16_to_f32_vec(uint16_t* src, float* dst, int64_t n) {
+ // support inplace op
+ for (int64_t i = n - 1; i >= 0; i--) {
+ dst[i] = bf16_to_f32(src[i]);
+ }
+}
+
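+// convert n elements between ggml types: plain copy when types match, f16/f32 row helpers
+// or ggml_quantize_chunk otherwise, dequantizing through a temporary f32 buffer when
+// neither side is f32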
+void convert_tensor(void* src, ggml_type src_type, void* dst, ggml_type dst_type, int n) {
+ if (src_type == dst_type) {
+ size_t nbytes = n * ggml_type_size(src_type) / ggml_blck_size(src_type);
+ memcpy(((char*)dst), ((char*)src), nbytes);
+ } else if (src_type == GGML_TYPE_F32) {
+ if (dst_type == GGML_TYPE_F16) {
+ ggml_fp32_to_fp16_row((float*)src, (ggml_fp16_t*)dst, n);
+ } else {
+ int64_t hist[16];
+ ggml_quantize_chunk(dst_type, (float*)src, dst, 0, n, hist);
+ }
+ } else if (dst_type == GGML_TYPE_F32) {
+ if (src_type == GGML_TYPE_F16) {
+ ggml_fp16_to_fp32_row((ggml_fp16_t*)src, (float*)dst, n);
+ } else {
+ auto qtype = ggml_internal_get_type_traits(src_type);
+ if (qtype.to_float == NULL) {
+ throw std::runtime_error(format("type %s unsupported for integer quantization: no dequantization available",
+ ggml_type_name(src_type)));
+ }
+ qtype.to_float(src, (float*)dst, n);
+ }
+ } else {
+ // src_type == GGML_TYPE_F16 => dst_type is quantized
+ // src_type is quantized => dst_type == GGML_TYPE_F16 or dst_type is quantized
+ auto qtype = ggml_internal_get_type_traits(src_type);
+ if (qtype.to_float == NULL) {
+ throw std::runtime_error(format("type %s unsupported for integer quantization: no dequantization available",
+ ggml_type_name(src_type)));
+ }
+        std::vector<char> buf;
+ buf.resize(sizeof(float) * n);
+ char* src_data_f32 = buf.data();
+ qtype.to_float(src, (float*)src_data_f32, n);
+ if (dst_type == GGML_TYPE_F16) {
+ ggml_fp32_to_fp16_row((float*)src_data_f32, (ggml_fp16_t*)dst, n);
+ } else {
+ int64_t hist[16];
+ ggml_quantize_chunk(dst_type, (float*)src_data_f32, dst, 0, n, hist);
+ }
+ }
+}
+
+/*================================================= ModelLoader ==================================================*/
+
+// ported from https://github.com/openai/CLIP/blob/main/clip/simple_tokenizer.py#L16
+std::map<char, int> unicode_to_byte() {
+    std::map<int, char> byte_to_unicode;
+
+ // List of utf-8 byte ranges
+    for (int b = static_cast<int>('!'); b <= static_cast<int>('~'); ++b) {
+        byte_to_unicode[b] = static_cast<char>(b);
+ }
+
+ for (int b = 49825; b <= 49836; ++b) {
+        byte_to_unicode[b] = static_cast<char>(b);
+ }
+
+ for (int b = 49838; b <= 50111; ++b) {
+        byte_to_unicode[b] = static_cast<char>(b);
+ }
+    // printf("%d %d %d %d\n", static_cast<int>('¡'), static_cast<int>('¬'), static_cast<int>('®'), static_cast<int>('ÿ'));
+ // exit(1);
+
+ int n = 0;
+ for (int b = 0; b < 256; ++b) {
+ if (byte_to_unicode.find(b) == byte_to_unicode.end()) {
+            byte_to_unicode[b] = static_cast<char>(256 + n);
+ n++;
+ }
+ }
+
+ // byte_encoder = bytes_to_unicode()
+ // byte_decoder = {v: k for k, v in byte_encoder.items()}
+    std::map<char, int> byte_decoder;
+
+ for (const auto& entry : byte_to_unicode) {
+ byte_decoder[entry.second] = entry.first;
+ }
+
+ byte_to_unicode.clear();
+
+ return byte_decoder;
+}
+
+bool ModelLoader::init_from_file(const std::string& file_path, const std::string& prefix) {
+ file_paths_.push_back(file_path);
+ return true;
+}
+
+bool ModelLoader::init_from_files(const std::vector<std::string>& file_paths) {
+ for (auto& file_path : file_paths) {
+ if (!init_from_file(file_path)) {
+ return false;
+ }
+ }
+ return true;
+}
+
+SDVersion ModelLoader::get_sd_version() {
+ TensorStorage token_embedding_weight;
+ for (auto& tensor_storage : tensor_storages) {
+ if (tensor_storage.name == "cond_stage_model.transformer.text_model.embeddings.token_embedding.weight" ||
+ tensor_storage.name == "cond_stage_model.model.token_embedding.weight" ||
+ tensor_storage.name == "text_model.embeddings.token_embedding.weight" ||
+ tensor_storage.name == "te.text_model.embeddings.token_embedding.weight") {
+ token_embedding_weight = tensor_storage;
+ break;
+ }
+ }
+ if (token_embedding_weight.ne[0] == 768) {
+ return VERSION_1_x;
+ } else if (token_embedding_weight.ne[0] == 1024) {
+ return VERSION_2_x;
+ }
+ return VERSION_COUNT;
+}
+
+ggml_type ModelLoader::get_sd_wtype() {
+ for (auto& tensor_storage : tensor_storages) {
+ if (is_unused_tensor(tensor_storage.name)) {
+ continue;
+ }
+
+ if (tensor_storage.name.find(".weight") != std::string::npos &&
+ tensor_storage.name.find("time_embed") != std::string::npos) {
+ return tensor_storage.type;
+ }
+ }
+ return GGML_TYPE_COUNT;
+}
+
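+// parse the embedded vocab.json, map each token's bytes back through the byte decoder
+// and hand (token, id) pairs to the callback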
+bool ModelLoader::load_vocab(on_new_token_cb_t on_new_token_cb) {
+    char* vocab_buffer = reinterpret_cast<char*>(vocab_json);
+ nlohmann::json vocab = nlohmann::json::parse(vocab_buffer);
+    std::map<char, int> decoder = unicode_to_byte();
+ for (auto& it : vocab.items()) {
+ int token_id = it.value();
+ std::string token_str = it.key();
+ std::string token = "";
+ for (char c : token_str) {
+ token += decoder[c];
+ }
+ on_new_token_cb(token, token_id);
+ }
+ return true;
+}
+
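+// for each file: open it (as a zip for .ckpt archives), preprocess the tensor names,
+// then read, widen bf16 to f32 if needed, convert to the destination type and copy the
+// data into the tensor provided by the callback (directly for CPU backends, via
+// ggml_backend_tensor_set otherwise)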
+bool ModelLoader::load_tensors(on_new_tensor_cb_t on_new_tensor_cb) {
+ bool success = true;
+ for (size_t file_index = 0; file_index < file_paths_.size(); file_index++) {
+ std::string file_path = file_paths_[file_index];
+ LOG_DEBUG("loading tensors from %s", file_path.c_str());
+
+ std::ifstream file(file_path, std::ios::binary);
+ if (!file.is_open()) {
+ LOG_ERROR("failed to open '%s'", file_path.c_str());
+ return false;
+ }
+
+ bool is_zip = false;
+ for (auto& tensor_storage : tensor_storages) {
+ if (tensor_storage.index_in_zip >= 0) {
+ is_zip = true;
+ break;
+ }
+ }
+
+ struct zip_t* zip = NULL;
+ if (is_zip) {
+ zip = zip_open(file_path.c_str(), 0, 'r');
+ if (zip == NULL) {
+ LOG_ERROR("failed to open zip '%s'", file_path.c_str());
+ return false;
+ }
+ }
+
+        std::vector<uint8_t> read_buffer;
+        std::vector<uint8_t> convert_buffer;
+
+ auto read_data = [&](const TensorStorage& tensor_storage, char* buf, size_t n) {
+ if (zip != NULL) {
+ zip_entry_openbyindex(zip, tensor_storage.index_in_zip);
+ size_t entry_size = zip_entry_size(zip);
+ if (entry_size != n) {
+ read_buffer.resize(entry_size);
+ zip_entry_noallocread(zip, (void*)read_buffer.data(), entry_size);
+ memcpy((void*)buf, (void*)(read_buffer.data() + tensor_storage.offset), n);
+ } else {
+ zip_entry_noallocread(zip, (void*)buf, n);
+ }
+ zip_entry_close(zip);
+ } else {
+ file.seekg(tensor_storage.offset);
+ file.read(buf, n);
+ if (!file) {
+ LOG_ERROR("read tensor data failed: '%s'", file_path.c_str());
+ return false;
+ }
+ }
+ return true;
+ };
+
+        std::vector<TensorStorage> processed_tensor_storages;
+ for (auto& tensor_storage : tensor_storages) {
+ if (tensor_storage.file_index != file_index) {
+ continue;
+ }
+
+ // LOG_DEBUG("%s", name.c_str());
+
+ if (is_unused_tensor(tensor_storage.name)) {
+ continue;
+ }
+
+ preprocess_tensor(tensor_storage, processed_tensor_storages);
+ }
+
+ for (auto& tensor_storage : processed_tensor_storages) {
+ // LOG_DEBUG("%s", name.c_str());
+
+ ggml_tensor* dst_tensor = NULL;
+
+ success = on_new_tensor_cb(tensor_storage, &dst_tensor);
+ if (!success) {
+ LOG_WARN("process tensor failed: '%s'", tensor_storage.name.c_str());
+ break;
+ }
+
+ if (dst_tensor == NULL) {
+ continue;
+ }
+
+ ggml_backend_t backend = ggml_get_backend(dst_tensor);
+
+ size_t nbytes_to_read = tensor_storage.nbytes_to_read();
+
+ if (backend == NULL || ggml_backend_is_cpu(backend)) {
+ // for the CPU and Metal backend, we can copy directly into the tensor
+ if (tensor_storage.type == dst_tensor->type) {
+ GGML_ASSERT(ggml_nbytes(dst_tensor) == nbytes_to_read);
+ read_data(tensor_storage, (char*)dst_tensor->data, nbytes_to_read);
+
+ if (tensor_storage.is_bf16) {
+ // inplace op
+ bf16_to_f32_vec((uint16_t*)dst_tensor->data, (float*)dst_tensor->data, tensor_storage.nelements());
+ }
+ } else {
+ read_buffer.resize(tensor_storage.nbytes());
+ read_data(tensor_storage, (char*)read_buffer.data(), nbytes_to_read);
+
+ if (tensor_storage.is_bf16) {
+ // inplace op
+ bf16_to_f32_vec((uint16_t*)read_buffer.data(), (float*)read_buffer.data(), tensor_storage.nelements());
+ }
+
+ convert_tensor((void*)read_buffer.data(), tensor_storage.type, dst_tensor->data,
+ dst_tensor->type, (int)tensor_storage.nelements());
+ }
+ } else {
+ read_buffer.resize(tensor_storage.nbytes());
+ read_data(tensor_storage, (char*)read_buffer.data(), nbytes_to_read);
+
+ if (tensor_storage.is_bf16) {
+ // inplace op
+ bf16_to_f32_vec((uint16_t*)read_buffer.data(), (float*)read_buffer.data(), tensor_storage.nelements());
+ }
+
+ if (tensor_storage.type == dst_tensor->type) {
+ // copy to device memory
+ ggml_backend_tensor_set(dst_tensor, read_buffer.data(), 0, ggml_nbytes(dst_tensor));
+ } else {
+ // convert first, then copy to device memory
+ convert_buffer.resize(ggml_nbytes(dst_tensor));
+ convert_tensor((void*)read_buffer.data(), tensor_storage.type,
+ (void*)convert_buffer.data(), dst_tensor->type,
+ (int)tensor_storage.nelements());
+ ggml_backend_tensor_set(dst_tensor, convert_buffer.data(), 0, ggml_nbytes(dst_tensor));
+ }
+ }
+ }
+
+ if (zip != NULL) {
+ zip_close(zip);
+ }
+
+ if (!success) {
+ break;
+ }
+ }
+ return success;
+}
+
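+// upper bound on the memory needed for all used tensors: raw bytes plus per-tensor
+// alignment padding and a fixed 10 MiB margin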
+int64_t ModelLoader::cal_mem_size() {
+ int64_t mem_size = 0;
+ for (auto& tensor_storage : tensor_storages) {
+ if (is_unused_tensor(tensor_storage.name)) {
+ continue;
+ }
+
+ mem_size += tensor_storage.nbytes();
+ mem_size += GGML_MEM_ALIGN * 2; // for lora alphas
+ }
+
+ return mem_size + 10 * 1024 * 1024;
+}
+
+/*================================================= GGUFModelLoader ==================================================*/
+
+bool GGUFModelLoader::init_from_file(const std::string& file_path, const std::string& prefix) {
+ LOG_INFO("loading model from '%s'", file_path.c_str());
+ ModelLoader::init_from_file(file_path, prefix);
+ size_t file_index = file_paths_.size() - 1;
+
+ gguf_context* ctx_gguf_ = NULL;
+ ggml_context* ctx_meta_ = NULL;
+ ctx_gguf_ = gguf_init_from_file(file_path.c_str(), {true, &ctx_meta_});
+ if (!ctx_gguf_) {
+ LOG_ERROR("failed to open '%s'", file_path.c_str());
+ return false;
+ }
+
+ int n_tensors = gguf_get_n_tensors(ctx_gguf_);
+
+ size_t total_size = 0;
+ size_t data_offset = gguf_get_data_offset(ctx_gguf_);
+ for (int i = 0; i < n_tensors; i++) {
+ std::string name = gguf_get_tensor_name(ctx_gguf_, i);
+ struct ggml_tensor* dummy = ggml_get_tensor(ctx_meta_, name.c_str());
+ size_t offset = data_offset + gguf_get_tensor_offset(ctx_gguf_, i);
+
+ // LOG_DEBUG("%s", name.c_str());
+
+ TensorStorage tensor_storage(prefix + name, dummy->type, dummy->ne, dummy->n_dims, file_index, offset);
+
+ GGML_ASSERT(ggml_nbytes(dummy) == tensor_storage.nbytes());
+
+ tensor_storages.push_back(tensor_storage);
+ }
+
+ gguf_free(ctx_gguf_);
+ ggml_free(ctx_meta_);
+
+ return true;
+}
+
+/*================================================= SafeTensorsModelLoader ==================================================*/
+
+#define ST_HEADER_SIZE_LEN 8
+
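+// BF16 is reported as F32 here because bf16 payloads are widened to float while reading
+// (see the is_bf16 handling in init_from_file and bf16_to_f32_vec)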
+ggml_type str_to_ggml_type(const std::string& dtype) {
+ ggml_type ttype = GGML_TYPE_COUNT;
+ if (dtype == "F16") {
+ ttype = GGML_TYPE_F16;
+    } else if (dtype == "BF16") {
+        // bf16 tensors are converted to f32 while loading (is_bf16 path), so report f32 here
+        ttype = GGML_TYPE_F32;
+ } else if (dtype == "F32") {
+ ttype = GGML_TYPE_F32;
+ }
+ return ttype;
+}
+
+// https://huggingface.co/docs/safetensors/index
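+// File layout: an 8-byte little-endian header length N, then N bytes of JSON metadata
+// ({"tensor_name": {"dtype", "shape", "data_offsets"}, ...}), then the raw tensor data.
+// data_offsets are relative to the start of that data section, i.e. to byte 8 + N of the file.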
+bool SafeTensorsModelLoader::init_from_file(const std::string& file_path, const std::string& prefix) {
+ ModelLoader::init_from_file(file_path, prefix);
+ size_t file_index = file_paths_.size() - 1;
+ std::ifstream file(file_path, std::ios::binary);
+ if (!file.is_open()) {
+ LOG_ERROR("failed to open '%s'", file_path.c_str());
+ return false;
+ }
+
+ // get file size
+ file.seekg(0, file.end);
+ size_t file_size_ = file.tellg();
+ file.seekg(0, file.beg);
+
+ // read header size
+ if (file_size_ <= ST_HEADER_SIZE_LEN) {
+ LOG_ERROR("invalid safetensor file '%s'", file_path.c_str());
+ return false;
+ }
+
+ uint8_t header_size_buf[ST_HEADER_SIZE_LEN];
+ file.read((char*)header_size_buf, ST_HEADER_SIZE_LEN);
+ if (!file) {
+ LOG_ERROR("read safetensors header size failed: '%s'", file_path.c_str());
+ return false;
+ }
+
+ size_t header_size_ = read_u64(header_size_buf);
+ if (header_size_ >= file_size_) {
+ LOG_ERROR("invalid safetensor file '%s'", file_path.c_str());
+ return false;
+ }
+
+ // read header
+    std::vector<char> header_buf;
+ header_buf.resize(header_size_ + 1);
+ header_buf[header_size_] = '\0';
+ file.read(header_buf.data(), header_size_);
+ if (!file) {
+ LOG_ERROR("read safetensors header failed: '%s'", file_path.c_str());
+ return false;
+ }
+
+ nlohmann::json header_ = nlohmann::json::parse(header_buf.data());
+
+ for (auto& item : header_.items()) {
+ std::string name = item.key();
+ nlohmann::json tensor_info = item.value();
+ // LOG_DEBUG("%s %s\n", name.c_str(), tensor_info.dump().c_str());
+
+ if (name == "__metadata__") {
+ continue;
+ }
+
+ if (is_unused_tensor(name)) {
+ continue;
+ }
+
+ std::string dtype = tensor_info["dtype"];
+ nlohmann::json shape = tensor_info["shape"];
+
+        size_t begin = tensor_info["data_offsets"][0].get<size_t>();
+        size_t end = tensor_info["data_offsets"][1].get<size_t>();
+
+ ggml_type type = str_to_ggml_type(dtype);
+ if (type == GGML_TYPE_COUNT) {
+ LOG_ERROR("unsupported dtype '%s'", dtype.c_str());
+ return false;
+ }
+
+ if (shape.size() > 4) {
+ LOG_ERROR("invalid tensor '%s'", name.c_str());
+ return false;
+ }
+
+ int n_dims = (int)shape.size();
+ int64_t ne[4] = {1, 1, 1, 1};
+ for (int i = 0; i < n_dims; i++) {
+            ne[i] = shape[i].get<int64_t>();
+ }
+
+ TensorStorage tensor_storage(prefix + name, type, ne, n_dims, file_index, ST_HEADER_SIZE_LEN + header_size_ + begin);
+
+ tensor_storage.reverse_ne();
+
+ size_t tensor_data_size = end - begin;
+
+ if (dtype == "BF16") {
+ tensor_storage.is_bf16 = true;
+ GGML_ASSERT(tensor_storage.nbytes() == tensor_data_size * 2);
+ } else {
+ GGML_ASSERT(tensor_storage.nbytes() == tensor_data_size);
+ }
+
+ tensor_storages.push_back(tensor_storage);
+ }
+
+ return true;
+}
+
+/*================================================= DiffusersModelLoader ==================================================*/
+
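+// A diffusers checkpoint is a directory holding separate safetensors files for the unet,
+// vae and text encoder; each one is loaded with its own prefix ("unet.", "vae.", "te.")
+// so the tensors can be distinguished later.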
+bool DiffusersModelLoader::init_from_file(const std::string& file_path, const std::string& prefix) {
+ if (!is_directory(file_path)) {
+ return SafeTensorsModelLoader::init_from_file(file_path, prefix);
+ }
+ std::string unet_path = path_join(file_path, "unet/diffusion_pytorch_model.safetensors");
+ std::string vae_path = path_join(file_path, "vae/diffusion_pytorch_model.safetensors");
+ std::string clip_path = path_join(file_path, "text_encoder/model.safetensors");
+
+ if (!SafeTensorsModelLoader::init_from_file(unet_path, "unet.")) {
+ return false;
+ }
+ if (!SafeTensorsModelLoader::init_from_file(vae_path, "vae.")) {
+ return false;
+ }
+ if (!SafeTensorsModelLoader::init_from_file(clip_path, "te.")) {
+ return false;
+ }
+ return true;
+}
+
+/*================================================= CkptModelLoader ==================================================*/
+
+// $ python -m pickletools sd-v1-4/archive/data.pkl | head -n 100
+// 0: \x80 PROTO 2
+// 2: } EMPTY_DICT
+// 3: q BINPUT 0
+// 5: ( MARK
+// 6: X BINUNICODE 'epoch'
+// 16: q BINPUT 1
+// 18: K BININT1 6
+// 20: X BINUNICODE 'global_step'
+// 36: q BINPUT 2
+// 38: J BININT 470000
+// 43: X BINUNICODE 'pytorch-lightning_version'
+// 73: q BINPUT 3
+// 75: X BINUNICODE '1.4.2'
+// 85: q BINPUT 4
+// 87: X BINUNICODE 'state_dict'
+// 102: q BINPUT 5
+// 104: } EMPTY_DICT
+// 105: q BINPUT 6
+// 107: ( MARK
+// 108: X BINUNICODE 'betas'
+// 118: q BINPUT 7
+// 120: c GLOBAL 'torch._utils _rebuild_tensor_v2'
+// 153: q BINPUT 8
+// 155: ( MARK
+// 156: ( MARK
+// 157: X BINUNICODE 'storage'
+// 169: q BINPUT 9
+// 171: c GLOBAL 'torch FloatStorage'
+// 191: q BINPUT 10
+// 193: X BINUNICODE '0'
+// 199: q BINPUT 11
+// 201: X BINUNICODE 'cpu'
+// 209: q BINPUT 12
+// 211: M BININT2 1000
+// 214: t TUPLE (MARK at 156)
+// 215: q BINPUT 13
+// 217: Q BINPERSID
+// 218: K BININT1 0
+// 220: M BININT2 1000
+// ...............................
+// 3201: q BINPUT 250
+// 3203: R REDUCE
+// 3204: q BINPUT 251
+// 3206: X BINUNICODE 'model.diffusion_model.input_blocks.1.1.proj_in.weight'
+// 3264: q BINPUT 252
+// 3266: h BINGET 8
+// 3268: ( MARK
+// 3269: ( MARK
+// 3270: h BINGET 9
+// 3272: h BINGET 10
+// 3274: X BINUNICODE '30'
+// 3281: q BINPUT 253
+// 3283: h BINGET 12
+// 3285: J BININT 102400
+// 3290: t TUPLE (MARK at 3269)
+// 3291: q BINPUT 254
+// 3293: Q BINPERSID
+// 3294: K BININT1 0
+// 3296: ( MARK
+// 3297: M BININT2 320
+// 3300: M BININT2 320
+// 3303: K BININT1 1
+// 3305: K BININT1 1
+// 3307: t TUPLE (MARK at 3296)
+// 3308: q BINPUT 255
+// 3310: ( MARK
+// 3311: M BININT2 320
+// 3314: K BININT1 1
+// 3316: K BININT1 1
+// 3318: K BININT1 1
+// 3320: t TUPLE (MARK at 3310)
+// 3321: r LONG_BINPUT 256
+// 3326: \x89 NEWFALSE
+// 3327: h BINGET 16
+// 3329: ) EMPTY_TUPLE
+// 3330: R REDUCE
+// 3331: r LONG_BINPUT 257
+// 3336: t TUPLE (MARK at 3268)
+// 3337: r LONG_BINPUT 258
+// 3342: R REDUCE
+// 3343: r LONG_BINPUT 259
+// 3348: X BINUNICODE 'model.diffusion_model.input_blocks.1.1.proj_in.bias'
+// 3404: r LONG_BINPUT 260
+// 3409: h BINGET 8
+// 3411: ( MARK
+// 3412: ( MARK
+// 3413: h BINGET 9
+// 3415: h BINGET 10
+// 3417: X BINUNICODE '31'
+
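+// Pulls tensor metadata out of the pickle stream above. The reader is a small state machine:
+// READ_NAME waits for a state_dict key, READ_DATA resolves the storage id to a zip entry
+// ("<dir>data/<id>") and records its size, CHECK_SIZE matches the element count against that
+// size, and READ_DIMENS collects the shape values until the closing tuple opcode.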
+struct PickleTensorReader {
+ enum ReadPhase {
+ READ_NAME,
+ READ_DATA,
+ CHECK_SIZE,
+ READ_DIMENS
+ };
+ ReadPhase phase = READ_NAME;
+ size_t entry_size = 0;
+ int32_t nelements = 0;
+
+ TensorStorage tensor_storage;
+
+ static ggml_type global_type; // all pickle_tensors data type
+ static bool read_global_type;
+
+ bool read_int_value(uint32_t value) {
+ if (phase == CHECK_SIZE) {
+ if (entry_size == value * ggml_type_size(tensor_storage.type)) {
+ nelements = value;
+ phase = READ_DIMENS;
+ return true;
+ } else {
+ phase = READ_NAME;
+ }
+ } else if (phase == READ_DIMENS) {
+            if (tensor_storage.n_dims + 1 > 4) { // too many dimensions
+ phase = READ_NAME;
+ tensor_storage.n_dims = 0;
+ }
+ if (nelements % value == 0) {
+ tensor_storage.ne[tensor_storage.n_dims] = value;
+ tensor_storage.n_dims++;
+ }
+ }
+ return false;
+ }
+
+ void read_global(const std::string& str) {
+ if (str == "FloatStorage") {
+ if (read_global_type) {
+ global_type = GGML_TYPE_F32;
+ read_global_type = false;
+ }
+ tensor_storage.type = GGML_TYPE_F32;
+ } else if (str == "HalfStorage") {
+ if (read_global_type) {
+ global_type = GGML_TYPE_F16;
+ read_global_type = false;
+ }
+ tensor_storage.type = GGML_TYPE_F16;
+ }
+ }
+
+ void read_string(const std::string& str, struct zip_t* zip, std::string dir) {
+ if (str == "storage") {
+ read_global_type = true;
+ } else if (str != "state_dict") {
+ if (phase == READ_DATA) {
+ std::string entry_name = dir + "data/" + std::string(str);
+
+ size_t i, n = zip_entries_total(zip);
+ for (i = 0; i < n; ++i) {
+ zip_entry_openbyindex(zip, i);
+ {
+ std::string name = zip_entry_name(zip);
+ if (name == entry_name) {
+ tensor_storage.index_in_zip = (int)i;
+ entry_size = zip_entry_size(zip);
+ zip_entry_close(zip);
+ break;
+ }
+ }
+ zip_entry_close(zip);
+ }
+
+ phase = entry_size > 0 ? CHECK_SIZE : READ_NAME;
+ }
+ if (!read_global_type && phase == READ_NAME) {
+ tensor_storage.name = str;
+ phase = READ_DATA;
+ tensor_storage.type = global_type;
+ }
+ }
+ }
+};
+
+ggml_type PickleTensorReader::global_type = GGML_TYPE_F32; // all pickle_tensors data type
+bool PickleTensorReader::read_global_type = false;
+
+int find_char(uint8_t* buffer, int len, char c) {
+ for (int pos = 0; pos < len; pos++) {
+ if (buffer[pos] == c) {
+ return pos;
+ }
+ }
+ return -1;
+}
+
+#define MAX_STRING_BUFFER 512
+
+bool CkptModelLoader::parse_data_pkl(uint8_t* buffer,
+ size_t buffer_size,
+ zip_t* zip,
+ std::string dir,
+ size_t file_index,
+ const std::string& prefix) {
+ uint8_t* buffer_end = buffer + buffer_size;
+ if (buffer[0] == 0x80) { // proto
+ if (buffer[1] != 2) {
+ LOG_ERROR("Unsupported protocol\n");
+ return false;
+ }
+ buffer += 2; // 0x80 and version
+ char string_buffer[MAX_STRING_BUFFER];
+ bool finish = false;
+ PickleTensorReader reader;
+ // read pickle binary file
+ while (!finish && buffer < buffer_end) {
+ uint8_t opcode = *buffer;
+ buffer++;
+ // https://github.com/python/cpython/blob/3.7/Lib/pickletools.py#L1048
+ // https://github.com/python/cpython/blob/main/Lib/pickle.py#L105
+ switch (opcode) {
+ case '}': // EMPTY_DICT = b'}' # push empty dict
+ break;
+ case ']': // EMPTY_LIST = b']' # push empty list
+ break;
+ // skip unused sections
+ case 'h': // BINGET = b'h' # " " " " " " ; " " 1-byte arg
+ case 'q': // BINPUT = b'q' # " " " " " ; " " 1-byte arg
+ case 'Q': // BINPERSID = b'Q' # " " " ; " " " " stack
+ buffer++;
+ break;
+ case 'r': // LONG_BINPUT = b'r' # " " " " " ; " " 4-byte arg
+ buffer += 4;
+ break;
+ case 0x95: // FRAME = b'\x95' # indicate the beginning of a new frame
+ buffer += 8;
+ break;
+ case 0x94: // MEMOIZE = b'\x94' # store top of the stack in memo
+ break;
+ case '(': // MARK = b'(' # push special markobject on stack
+ break;
+ case 'K': // BININT1 = b'K' # push 1-byte unsigned int
+ {
+ uint8_t value = *buffer;
+ if (reader.read_int_value(value)) {
+ buffer++;
+ }
+ buffer++;
+ } break;
+ case 'M': // BININT2 = b'M' # push 2-byte unsigned int
+ {
+ uint16_t value = read_short(buffer);
+ if (reader.read_int_value(value)) {
+ buffer++;
+ }
+ buffer += 2;
+ } break;
+ case 'J': // BININT = b'J' # push four-byte signed int
+ {
+ const int32_t value = read_int(buffer);
+ if (reader.read_int_value(value)) {
+ buffer++; // skip tuple after read num_elements
+ }
+ buffer += 4;
+ } break;
+ case 'X': // BINUNICODE = b'X' # " " " ; counted UTF-8 string argument
+ {
+ const int32_t len = read_int(buffer);
+ buffer += 4;
+ memset(string_buffer, 0, MAX_STRING_BUFFER);
+                    if (len >= MAX_STRING_BUFFER) {
+                        LOG_WARN("tensor name too long, truncating");
+                    }
+ memcpy(string_buffer, buffer, len < MAX_STRING_BUFFER ? len : (MAX_STRING_BUFFER - 1));
+ buffer += len;
+ reader.read_string(string_buffer, zip, dir);
+ } break;
+ case 0x8C: // SHORT_BINUNICODE = b'\x8c' # push short string; UTF-8 length < 256 bytes
+ {
+                    const uint8_t len = *buffer; // length fits in one unsigned byte (< 256)
+ buffer++;
+ memset(string_buffer, 0, MAX_STRING_BUFFER);
+ memcpy(string_buffer, buffer, len);
+ buffer += len;
+ // printf("String: '%s'\n", string_buffer);
+ } break;
+ case 'c': // GLOBAL = b'c' # push self.find_class(modname, name); 2 string args
+ {
+ int len = find_char(buffer, MAX_STRING_BUFFER, '\n');
+
+ buffer += len + 1;
+ len = find_char(buffer, MAX_STRING_BUFFER, '\n');
+
+ memset(string_buffer, 0, MAX_STRING_BUFFER);
+ memcpy(string_buffer, buffer, len);
+ buffer += len + 1;
+ reader.read_global(string_buffer);
+ } break;
+ case 0x86: // TUPLE2 = b'\x86' # build 2-tuple from two topmost stack items
+ case 0x85: // TUPLE1 = b'\x85' # build 1-tuple from stack top
+ case 't': // TUPLE = b't' # build tuple from topmost stack items
+ if (reader.phase == PickleTensorReader::READ_DIMENS) {
+ reader.tensor_storage.reverse_ne();
+ reader.tensor_storage.file_index = file_index;
+ reader.tensor_storage.name = prefix + reader.tensor_storage.name;
+ tensor_storages.push_back(reader.tensor_storage);
+ // reset
+ reader = PickleTensorReader();
+ }
+ break;
+ case '.': // STOP = b'.' # every pickle ends with STOP
+ finish = true;
+ break;
+ default:
+ break;
+ }
+ }
+ }
+ return true;
+}
+
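+// A .ckpt file is a zip archive written by torch.save(): it contains a data.pkl entry with the
+// pickled state_dict plus one data/<n> entry per tensor storage, all under a common directory
+// prefix. Scanning data.pkl is enough to recover names, dtypes, shapes and zip entry indices.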
+bool CkptModelLoader::init_from_file(const std::string& file_path, const std::string& prefix) {
+ ModelLoader::init_from_file(file_path, prefix);
+ size_t file_index = file_paths_.size() - 1;
+
+ struct zip_t* zip = zip_open(file_path.c_str(), 0, 'r');
+ if (zip == NULL) {
+ LOG_ERROR("failed to open '%s'", file_path.c_str());
+ return false;
+ }
+ int n = (int)zip_entries_total(zip);
+ for (int i = 0; i < n; ++i) {
+ zip_entry_openbyindex(zip, i);
+ {
+ std::string name = zip_entry_name(zip);
+ size_t pos = name.find("data.pkl");
+ if (pos != std::string::npos) {
+ std::string dir = name.substr(0, pos);
+ void* pkl_data = NULL;
+ size_t pkl_size;
+ zip_entry_read(zip, &pkl_data, &pkl_size);
+
+                LOG_DEBUG("data.pkl size: %zu", pkl_size);
+
+ parse_data_pkl((uint8_t*)pkl_data, pkl_size, zip, dir, file_index, prefix);
+
+ free(pkl_data);
+ }
+ }
+ zip_entry_close(zip);
+ }
+ zip_close(zip);
+ return true;
+}
+
+/*================================================= init_model_loader_from_file ==================================================*/
+
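+// Picks a concrete loader from the path: a directory is treated as a diffusers checkpoint,
+// otherwise the file extension decides between gguf, safetensors and ckpt.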
+ModelLoader* init_model_loader_from_file(const std::string& file_path) {
+ ModelLoader* model_loader = NULL;
+ if (is_directory(file_path)) {
+ LOG_DEBUG("load %s using diffusers format", file_path.c_str());
+ model_loader = new DiffusersModelLoader();
+ } else if (ends_with(file_path, ".gguf")) {
+ LOG_DEBUG("load %s using gguf format", file_path.c_str());
+ model_loader = new GGUFModelLoader();
+ } else if (ends_with(file_path, ".safetensors")) {
+ LOG_DEBUG("load %s using safetensors format", file_path.c_str());
+ model_loader = new SafeTensorsModelLoader();
+ } else if (ends_with(file_path, ".ckpt")) {
+ LOG_DEBUG("load %s using checkpoint format", file_path.c_str());
+ model_loader = new CkptModelLoader();
+ } else {
+ LOG_DEBUG("unknown format %s", file_path.c_str());
+ return NULL;
+ }
+ if (!model_loader->init_from_file(file_path)) {
+ delete model_loader;
+ model_loader = NULL;
+ }
+ return model_loader;
+}
\ No newline at end of file
diff --git a/model.h b/model.h
new file mode 100644
index 0000000..3012d80
--- /dev/null
+++ b/model.h
@@ -0,0 +1,142 @@
+#ifndef __MODEL_H__
+#define __MODEL_H__
+
+#include <string>
+#include <vector>