perf: free unused params immediately to reduce memory usage

2023-08-17 00:55:36 +08:00
parent cbee3c9a4f
commit 8f34dd7cc7
4 changed files with 232 additions and 73 deletions
--- a/README.md
+++ b/README.md
@@ -12,6 +12,7 @@ Inference of [Stable Diffusion](https://github.com/CompVis/stable-diffusion) in
 - 16-bit, 32-bit float support
 - 4-bit, 5-bit and 8-bit integer quantization support
 - Accelerated memory-efficient CPU inference
    - Only requires ~2.3GB when using txt2img with fp16 precision to generate a 512x512 image
 - AVX, AVX2 and AVX512 support for x86 architectures
 - Original `txt2img` and `img2img` mode
 - Negative prompt
@@ -152,8 +153,8 @@ Using formats of different precisions will yield results of varying quality.
 | precision | f32  | f16  |q8_0  |q5_0  |q5_1  |q4_0  |q4_1  |
 | ----         | ----  |----  |----  |----  |----  |----  |----  |
-|  **Disk**        | 2.8G | 2.0G | 1.7G | 1.6G | 1.6G | 1.5G | 1.5G |
+|  **Disk**        | 2.7G | 2.0G | 1.7G | 1.6G | 1.6G | 1.5G | 1.5G |
-|  **Memory**(txt2img - 512 x 512) | ~4.9G | ~4.1G | ~3.8G | ~3.7G | ~3.7G | ~3.6G | ~3.6G |
+|  **Memory**(txt2img - 512 x 512) | ~2.8G | ~2.3G | ~2.1G | ~2.0G | ~2.0G | ~2.0G | ~2.0G |
 ## References
--- a/main.cpp
+++ b/main.cpp
@@ -322,7 +322,8 @@ int main(int argc, const char* argv[]) {
        }
        init_img.assign(img_data, img_data + (opt.w * opt.h * c));
    }
-    StableDiffusion sd(opt.n_threads, vae_decode_only);
+
    StableDiffusion sd(opt.n_threads, vae_decode_only, true);
    if (!sd.load_from_file(opt.model_path)) {
        return 1;
    }
--- a/stable-diffusion.cpp
+++ b/stable-diffusion.cpp
@@ -1186,7 +1186,9 @@ struct DownSample {
    static void asymmetric_pad(struct ggml_tensor* dst,
                               const struct ggml_tensor* a,
                               const struct ggml_tensor* b,
-                               int ith, int nth, void * userdata) {
+                               int ith,
                               int nth,
                               void* userdata) {
        assert(sizeof(dst->nb[0]) == sizeof(float));
        assert(sizeof(a->nb[0]) == sizeof(float));
        assert(sizeof(b->nb[0]) == sizeof(float));
@@ -1450,6 +1452,8 @@ struct UNetModel {
        mem_size += out_channels * model_channels * 3 * 3 * ggml_type_sizef(GGML_TYPE_F16);  // out_2_w
        mem_size += out_channels * ggml_type_sizef(GGML_TYPE_F32);                           // out_2_b
        mem_size += 4 * ggml_tensor_overhead();
        return static_cast<size_t>(mem_size);
    }
@@ -2470,13 +2474,20 @@ struct CompVisDenoiser {
 class StableDiffusionGGML {
   public:
-    ggml_context* params_ctx = NULL;
+    ggml_context* clip_params_ctx = NULL;
    ggml_context* unet_params_ctx = NULL;
    ggml_context* vae_params_ctx = NULL;
    bool dynamic = true;
-    bool vae_decode_only = true;
+    bool vae_decode_only = false;
    bool free_params_immediately = false;
    int32_t ftype = 1;
    int n_threads = -1;
    float scale_factor = 0.18215f;
-    size_t max_rt_size = 0;
+    size_t max_mem_size = 0;
    size_t curr_params_mem_size = 0;
    size_t max_params_mem_size = 0;
    size_t max_rt_mem_size = 0;
    FrozenCLIPEmbedder cond_stage_model;
    UNetModel diffusion_model;
@@ -2484,19 +2495,29 @@ class StableDiffusionGGML {
    CompVisDenoiser denoiser;
    std::map<std::string, struct ggml_tensor*> tensors;
    StableDiffusionGGML() = default;
-    StableDiffusionGGML(int n_threads, bool vae_decode_only)
+    StableDiffusionGGML(int n_threads,
-        : n_threads(n_threads), vae_decode_only(vae_decode_only) {
+                        bool vae_decode_only,
                        bool free_params_immediately)
        : n_threads(n_threads),
          vae_decode_only(vae_decode_only),
          free_params_immediately(free_params_immediately) {
        first_stage_model.decode_only = vae_decode_only;
    }
    ~StableDiffusionGGML() {
-        if (params_ctx != NULL) {
+        if (clip_params_ctx != NULL) {
-            ggml_free(params_ctx);
+            ggml_free(clip_params_ctx);
-            params_ctx = NULL;
+            clip_params_ctx = NULL;
        }
        if (unet_params_ctx != NULL) {
            ggml_free(unet_params_ctx);
            unet_params_ctx = NULL;
        }
        if (vae_params_ctx != NULL) {
            ggml_free(vae_params_ctx);
            vae_params_ctx = NULL;
        }
    }
@@ -2559,50 +2580,89 @@ class StableDiffusionGGML {
            }
        }
-        double ctx_size = 0;
+        // create the ggml context for network params
        LOG_DEBUG("ggml tensor size = %d bytes", (int)sizeof(ggml_tensor));
        {
            // cond_stage_model(FrozenCLIPEmbedder)
            double ctx_size = 1 * 1024 * 1024;  // 1 MB, for padding
            ctx_size += cond_stage_model.text_model.compute_params_mem_size(wtype);
            LOG_DEBUG("clip params ctx size = % 6.2f MB", ctx_size / (1024.0 * 1024.0));
            // diffusion_model(UNetModel)
            ctx_size += diffusion_model.compute_params_mem_size(wtype);
            // first_stage_model(AutoEncoderKL)
            ctx_size += first_stage_model.compute_params_mem_size(wtype);
            LOG_DEBUG("ggml tensor size = %d bytes", (int)sizeof(ggml_tensor));
            LOG_INFO("params ctx size = % 6.2f MB", ctx_size / (1024.0 * 1024.0));
        }
        // create the ggml context for network params
        {
            struct ggml_init_params params;
            params.mem_size = static_cast<size_t>(ctx_size);
            params.mem_buffer = NULL;
            params.no_alloc = false;
            params.dynamic = false;
-            params_ctx = ggml_init(params);
+            clip_params_ctx = ggml_init(params);
-            if (!params_ctx) {
+            if (!clip_params_ctx) {
                LOG_ERROR("ggml_init() failed");
                return false;
            }
        }
        {
            // diffusion_model(UNetModel)
            double ctx_size = 1 * 1024 * 1024;  // 1 MB, for padding
            ctx_size += diffusion_model.compute_params_mem_size(wtype);
            LOG_DEBUG("unet params ctx size = % 6.2f MB", ctx_size / (1024.0 * 1024.0));
            struct ggml_init_params params;
            params.mem_size = static_cast<size_t>(ctx_size);
            params.mem_buffer = NULL;
            params.no_alloc = false;
            params.dynamic = false;
            unet_params_ctx = ggml_init(params);
            if (!unet_params_ctx) {
                LOG_ERROR("ggml_init() failed");
                ggml_free(clip_params_ctx);
                clip_params_ctx = NULL;
                return false;
            }
        }
        {
            // first_stage_model(AutoEncoderKL)
            double ctx_size = 1 * 1024 * 1024;  // 1 MB, for padding
            ctx_size += first_stage_model.compute_params_mem_size(wtype);
            LOG_DEBUG("vae params ctx size = % 6.2f MB", ctx_size / (1024.0 * 1024.0));
            struct ggml_init_params params;
            params.mem_size = static_cast<size_t>(ctx_size);
            params.mem_buffer = NULL;
            params.no_alloc = false;
            params.dynamic = false;
            vae_params_ctx = ggml_init(params);
            if (!vae_params_ctx) {
                LOG_ERROR("ggml_init() failed");
                ggml_free(clip_params_ctx);
                clip_params_ctx = NULL;
                ggml_free(unet_params_ctx);
                unet_params_ctx = NULL;
                return false;
            }
        }
        std::map<std::string, struct ggml_tensor*> tensors;
        LOG_DEBUG("preparing memory for the weights");
        // prepare memory for the weights
        {
            // cond_stage_model(FrozenCLIPEmbedder)
-            cond_stage_model.text_model.init_params(params_ctx, wtype);
+            cond_stage_model.text_model.init_params(clip_params_ctx, wtype);
            cond_stage_model.text_model.map_by_name(tensors, "cond_stage_model.transformer.text_model.");
            // diffusion_model(UNetModel)
-            diffusion_model.init_params(params_ctx, wtype);
+            diffusion_model.init_params(unet_params_ctx, wtype);
            diffusion_model.map_by_name(tensors, "model.diffusion_model.");
            // first_stage_model(AutoEncoderKL)
-            first_stage_model.init_params(params_ctx, wtype);
+            first_stage_model.init_params(vae_params_ctx, wtype);
            first_stage_model.map_by_name(tensors, "first_stage_model.");
        }
        LOG_DEBUG("loading weights");
        std::set<std::string> tensor_names_in_file;
        int64_t t0 = ggml_time_ms();
@@ -2707,8 +2767,16 @@ class StableDiffusionGGML {
                file.close();
                return false;
            }
-            LOG_DEBUG("model size = %8.2fMB", total_size / 1024.0 / 1024.0);
+            LOG_DEBUG("model size = %.2fMB", total_size / 1024.0 / 1024.0);
        }
        max_params_mem_size = ggml_used_mem(clip_params_ctx) + ggml_used_mem(unet_params_ctx) + ggml_used_mem(vae_params_ctx);
        max_mem_size = max_params_mem_size;
        curr_params_mem_size = max_params_mem_size;
        LOG_INFO("total params size = %.2fMB (clip %.2fMB, unet %.2fMB, vae %.2fMB)",
                 max_params_mem_size / 1024.0 / 1024.0,
                 ggml_used_mem(clip_params_ctx) / 1024.0 / 1024.0,
                 ggml_used_mem(unet_params_ctx) / 1024.0 / 1024.0,
                 ggml_used_mem(vae_params_ctx) / 1024.0 / 1024.0);
        int64_t t1 = ggml_time_ms();
        LOG_INFO("loading model from '%s' completed, taking %.2fs", file_path.c_str(), (t1 - t0) * 1.0f / 1000);
        file.close();
@@ -2784,14 +2852,26 @@ class StableDiffusionGGML {
        copy_ggml_tensor(result, hidden_states);
        // print_ggml_tensor(result);
-        size_t rt_size = ctx_size + ggml_curr_max_dynamic_size();
+        size_t rt_mem_size = ctx_size + ggml_curr_max_dynamic_size();
-        if (rt_size > max_rt_size) {
+        if (rt_mem_size > max_rt_mem_size) {
-            max_rt_size = rt_size;
+            max_rt_mem_size = rt_mem_size;
        }
-        LOG_INFO("condition graph use %.2fMB of memory: static %.2fMB, dynamic = %.2fMB",
+        size_t graph_mem_size = ggml_used_mem(clip_params_ctx) + rt_mem_size;
-                 rt_size * 1.0f / 1024 / 1024,
+
-                 ctx_size * 1.0f / 1024 / 1024,
+        size_t curr_mem_size = curr_params_mem_size + rt_mem_size;
-                 ggml_curr_max_dynamic_size() * 1.0f / 1024 / 1024);
+        if (curr_mem_size > max_mem_size) {
            max_mem_size = curr_mem_size;
        }
        LOG_INFO(
            "condition graph use %.2fMB of memory: params %.2fMB, "
            "runtime %.2fMB (static %.2fMB, dynamic %.2fMB)",
            graph_mem_size * 1.0f / 1024 / 1024,
            ggml_used_mem(clip_params_ctx) * 1.0f / 1024 / 1024,
            rt_mem_size * 1.0f / 1024 / 1024,
            ctx_size * 1.0f / 1024 / 1024,
            ggml_curr_max_dynamic_size() * 1.0f / 1024 / 1024);
        LOG_DEBUG("%zu bytes of dynamic memory has not been released yet", ggml_dynamic_size());
        ggml_free(ctx);
@@ -2985,21 +3065,33 @@ class StableDiffusionGGML {
                }
                int64_t t1 = ggml_time_ms();
                LOG_INFO("step %d sampling completed, taking %.2fs", i + 1, (t1 - t0) * 1.0f / 1000);
-                LOG_DEBUG("diffusion graph use %.2fMB of memory: static %.2fMB, dynamic = %.2fMB",
+                LOG_DEBUG("diffusion graph use %.2fMB runtime memory: static %.2fMB, dynamic %.2fMB",
                          (ctx_size + ggml_curr_max_dynamic_size()) * 1.0f / 1024 / 1024,
                          ctx_size * 1.0f / 1024 / 1024,
                          ggml_curr_max_dynamic_size() * 1.0f / 1024 / 1024);
                LOG_DEBUG("%zu bytes of dynamic memory has not been released yet", ggml_dynamic_size());
            }
        }
-        size_t rt_size = ctx_size + ggml_curr_max_dynamic_size();
+
-        if (rt_size > max_rt_size) {
+        size_t rt_mem_size = ctx_size + ggml_curr_max_dynamic_size();
-            max_rt_size = rt_size;
+        if (rt_mem_size > max_rt_mem_size) {
            max_rt_mem_size = rt_mem_size;
        }
-        LOG_INFO("diffusion graph use %.2fMB of memory: static %.2fMB, dynamic = %.2fMB",
+        size_t graph_mem_size = ggml_used_mem(unet_params_ctx) + rt_mem_size;
-                 rt_size * 1.0f / 1024 / 1024,
+
-                 ctx_size * 1.0f / 1024 / 1024,
+        size_t curr_mem_size = curr_params_mem_size + rt_mem_size;
-                 ggml_curr_max_dynamic_size() * 1.0f / 1024 / 1024);
+        if (curr_mem_size > max_mem_size) {
            max_mem_size = curr_mem_size;
        }
        LOG_INFO(
            "diffusion graph use %.2fMB of memory: params %.2fMB, "
            "runtime %.2fMB (static %.2fMB, dynamic %.2fMB)",
            graph_mem_size * 1.0f / 1024 / 1024,
            ggml_used_mem(unet_params_ctx) * 1.0f / 1024 / 1024,
            rt_mem_size * 1.0f / 1024 / 1024,
            ctx_size * 1.0f / 1024 / 1024,
            ggml_curr_max_dynamic_size() * 1.0f / 1024 / 1024);
        LOG_DEBUG("%zu bytes of dynamic memory has not been released yet", ggml_dynamic_size());
        ggml_free(ctx);
@@ -3065,14 +3157,25 @@ class StableDiffusionGGML {
            result = ggml_dup_tensor(res_ctx, moments);
            copy_ggml_tensor(result, moments);
-            size_t rt_size = ctx_size + ggml_curr_max_dynamic_size();
+            size_t rt_mem_size = ctx_size + ggml_curr_max_dynamic_size();
-            if (rt_size > max_rt_size) {
+            if (rt_mem_size > max_rt_mem_size) {
-                max_rt_size = rt_size;
+                max_rt_mem_size = rt_mem_size;
            }
-            LOG_INFO("vae graph use %.2fMB of memory: static %.2fMB, dynamic = %.2fMB",
+            size_t graph_mem_size = ggml_used_mem(vae_params_ctx) + rt_mem_size;
-                     rt_size * 1.0f / 1024 / 1024,
+
-                     ctx_size * 1.0f / 1024 / 1024,
+            size_t curr_mem_size = curr_params_mem_size + rt_mem_size;
-                     ggml_curr_max_dynamic_size() * 1.0f / 1024 / 1024);
+            if (curr_mem_size > max_mem_size) {
                max_mem_size = curr_mem_size;
            }
            LOG_INFO(
                "vae graph use %.2fMB of memory: params %.2fMB, "
                "runtime %.2fMB (static %.2fMB, dynamic %.2fMB)",
                graph_mem_size * 1.0f / 1024 / 1024,
                ggml_used_mem(vae_params_ctx) * 1.0f / 1024 / 1024,
                rt_mem_size * 1.0f / 1024 / 1024,
                ctx_size * 1.0f / 1024 / 1024,
                ggml_curr_max_dynamic_size() * 1.0f / 1024 / 1024);
            LOG_DEBUG("%zu bytes of dynamic memory has not been released yet", ggml_dynamic_size());
            ggml_free(ctx);
@@ -3179,14 +3282,25 @@ class StableDiffusionGGML {
            result_img = ggml_dup_tensor(res_ctx, img);
            copy_ggml_tensor(result_img, img);
-            size_t rt_size = ctx_size + ggml_curr_max_dynamic_size();
+            size_t rt_mem_size = ctx_size + ggml_curr_max_dynamic_size();
-            if (rt_size > max_rt_size) {
+            if (rt_mem_size > max_rt_mem_size) {
-                max_rt_size = rt_size;
+                max_rt_mem_size = rt_mem_size;
            }
-            LOG_INFO("vae graph use %.2fMB of memory: static %.2fMB, dynamic = %.2fMB",
+            size_t graph_mem_size = ggml_used_mem(vae_params_ctx) + rt_mem_size;
-                     rt_size * 1.0f / 1024 / 1024,
+
-                     ctx_size * 1.0f / 1024 / 1024,
+            size_t curr_mem_size = curr_params_mem_size + rt_mem_size;
-                     ggml_curr_max_dynamic_size() * 1.0f / 1024 / 1024);
+            if (curr_mem_size > max_mem_size) {
                max_mem_size = curr_mem_size;
            }
            LOG_INFO(
                "vae graph use %.2fMB of memory: params %.2fMB, "
                "runtime %.2fMB (static %.2fMB, dynamic %.2fMB)",
                graph_mem_size * 1.0f / 1024 / 1024,
                ggml_used_mem(vae_params_ctx) * 1.0f / 1024 / 1024,
                rt_mem_size * 1.0f / 1024 / 1024,
                ctx_size * 1.0f / 1024 / 1024,
                ggml_curr_max_dynamic_size() * 1.0f / 1024 / 1024);
            LOG_DEBUG("%zu bytes of dynamic memory has not been released yet", ggml_dynamic_size());
            ggml_free(ctx);
@@ -3198,8 +3312,12 @@ class StableDiffusionGGML {
 /*================================================= StableDiffusion ==================================================*/
-StableDiffusion::StableDiffusion(int n_threads, bool vae_decode_only) {
+StableDiffusion::StableDiffusion(int n_threads,
-    sd = std::make_shared<StableDiffusionGGML>(n_threads, vae_decode_only);
+                                 bool vae_decode_only,
                                 bool free_params_immediately) {
    sd = std::make_shared<StableDiffusionGGML>(n_threads,
                                               vae_decode_only,
                                               free_params_immediately);
 }
 bool StableDiffusion::load_from_file(const std::string& file_path) {
@@ -3240,6 +3358,12 @@ std::vector<uint8_t> StableDiffusion::txt2img(const std::string& prompt,
    int64_t t1 = ggml_time_ms();
    LOG_INFO("get_learned_condition completed, taking %.2fs", (t1 - t0) * 1.0f / 1000);
    if (sd->free_params_immediately) {
        sd->curr_params_mem_size -= ggml_used_mem(sd->clip_params_ctx);
        ggml_free(sd->clip_params_ctx);
        sd->clip_params_ctx = NULL;
    }
    int C = 4;
    int W = width / 8;
    int H = height / 8;
@@ -3255,18 +3379,32 @@ std::vector<uint8_t> StableDiffusion::txt2img(const std::string& prompt,
    int64_t t2 = ggml_time_ms();
    LOG_INFO("sampling completed, taking %.2fs", (t2 - t1) * 1.0f / 1000);
    if (sd->free_params_immediately) {
        sd->curr_params_mem_size -= ggml_used_mem(sd->unet_params_ctx);
        ggml_free(sd->unet_params_ctx);
        sd->unet_params_ctx = NULL;
    }
    struct ggml_tensor* img = sd->decode_first_stage(ctx, x_0);
    if (img != NULL) {
        result = ggml_to_image_vec(img);
    }
    int64_t t3 = ggml_time_ms();
    LOG_INFO("decode_first_stage completed, taking %.2fs", (t3 - t2) * 1.0f / 1000);
    if (sd->free_params_immediately) {
        sd->curr_params_mem_size -= ggml_used_mem(sd->vae_params_ctx);
        ggml_free(sd->vae_params_ctx);
        sd->vae_params_ctx = NULL;
    }
    LOG_INFO(
-        "txt2img completed in %.2fs, "
+        "txt2img completed in %.2fs, use %.2fMB of memory: peak params memory %.2fMB, "
-        "with a runtime memory usage of %.2fMB and parameter memory usage of %.2fMB",
+        "peak runtime memory %.2fMB",
        (t3 - t0) * 1.0f / 1000,
-        sd->max_rt_size * 1.0f / 1024 / 1024,
+        sd->max_mem_size * 1.0f / 1024 / 1024,
-        ggml_used_mem(sd->params_ctx) * 1.0f / 1024 / 1024);
+        sd->max_params_mem_size * 1.0f / 1024 / 1024,
        sd->max_rt_mem_size * 1.0f / 1024 / 1024);
    ggml_free(ctx);
    return result;
@@ -3330,6 +3468,11 @@ std::vector<uint8_t> StableDiffusion::img2img(const std::vector<uint8_t>& init_i
    }
    int64_t t2 = ggml_time_ms();
    LOG_INFO("get_learned_condition completed, taking %.2fs", (t2 - t1) * 1.0f / 1000);
    if (sd->free_params_immediately) {
        sd->curr_params_mem_size -= ggml_used_mem(sd->clip_params_ctx);
        ggml_free(sd->clip_params_ctx);
        sd->clip_params_ctx = NULL;
    }
    LOG_INFO("start sampling");
    struct ggml_tensor* x_0 = sd->sample(ctx, init_latent, c, uc, cfg_scale, sample_method, sigma_sched);
@@ -3337,6 +3480,11 @@ std::vector<uint8_t> StableDiffusion::img2img(const std::vector<uint8_t>& init_i
    // print_ggml_tensor(x_0);
    int64_t t3 = ggml_time_ms();
    LOG_INFO("sampling completed, taking %.2fs", (t3 - t2) * 1.0f / 1000);
    if (sd->free_params_immediately) {
        sd->curr_params_mem_size -= ggml_used_mem(sd->unet_params_ctx);
        ggml_free(sd->unet_params_ctx);
        sd->unet_params_ctx = NULL;
    }
    struct ggml_tensor* img = sd->decode_first_stage(ctx, x_0);
    if (img != NULL) {
@@ -3344,12 +3492,20 @@ std::vector<uint8_t> StableDiffusion::img2img(const std::vector<uint8_t>& init_i
    }
    int64_t t4 = ggml_time_ms();
    LOG_INFO("decode_first_stage completed, taking %.2fs", (t4 - t3) * 1.0f / 1000);
    if (sd->free_params_immediately) {
        sd->curr_params_mem_size -= ggml_used_mem(sd->vae_params_ctx);
        ggml_free(sd->vae_params_ctx);
        sd->vae_params_ctx = NULL;
    }
    LOG_INFO(
-        "img2img completed in %.2fs, "
+        "img2img completed in %.2fs, use %.2fMB of memory: peak params memory %.2fMB, "
-        "with a runtime memory usage of %.2fMB and parameter memory usage of %.2fMB",
+        "peak runtime memory %.2fMB",
        (t4 - t0) * 1.0f / 1000,
-        sd->max_rt_size * 1.0f / 1024 / 1024,
+        sd->max_mem_size * 1.0f / 1024 / 1024,
-        ggml_used_mem(sd->params_ctx) * 1.0f / 1024 / 1024);
+        sd->max_params_mem_size * 1.0f / 1024 / 1024,
        sd->max_rt_mem_size * 1.0f / 1024 / 1024);
    ggml_free(ctx);
--- a/stable-diffusion.h
+++ b/stable-diffusion.h
@@ -23,7 +23,8 @@ class StableDiffusion {
   public:
    StableDiffusion(int n_threads = -1,
-                    bool vae_decode_only = false);
+                    bool vae_decode_only = false,
                    bool free_params_immediately = false);
    bool load_from_file(const std::string& file_path);
    std::vector<uint8_t> txt2img(
        const std::string& prompt,