diff --git a/README.md b/README.md
index 45113d0..c3432bc 100644
--- a/README.md
+++ b/README.md
@@ -12,6 +12,7 @@ Inference of [Stable Diffusion](https://github.com/CompVis/stable-diffusion) in
 - 16-bit, 32-bit float support
 - 4-bit, 5-bit and 8-bit integer quantization support
 - Accelerated memory-efficient CPU inference
+    - Only requires ~2.3GB of memory when using txt2img with fp16 precision to generate a 512x512 image
 - AVX, AVX2 and AVX512 support for x86 architectures
 - Original `txt2img` and `img2img` mode
 - Negative prompt
@@ -152,8 +153,8 @@ Using formats of different precisions will yield results of varying quality.
 
 | precision | f32  | f16  |q8_0  |q5_0  |q5_1  |q4_0  |q4_1  |
 | ----         | ----  |----  |----  |----  |----  |----  |----  |
-|  **Disk**        | 2.8G | 2.0G | 1.7G | 1.6G | 1.6G | 1.5G | 1.5G |
-|  **Memory**(txt2img - 512 x 512) | ~4.9G | ~4.1G | ~3.8G | ~3.7G | ~3.7G | ~3.6G | ~3.6G |
+|  **Disk**        | 2.7G | 2.0G | 1.7G | 1.6G | 1.6G | 1.5G | 1.5G |
+|  **Memory**(txt2img - 512 x 512) | ~2.8G | ~2.3G | ~2.1G | ~2.0G | ~2.0G | ~2.0G | ~2.0G |
 
 
 ## References
diff --git a/main.cpp b/main.cpp
index 8f9d1e8..0412d43 100644
--- a/main.cpp
+++ b/main.cpp
@@ -322,7 +322,8 @@ int main(int argc, const char* argv[]) {
         }
         init_img.assign(img_data, img_data + (opt.w * opt.h * c));
     }
-    StableDiffusion sd(opt.n_threads, vae_decode_only);
+
+    StableDiffusion sd(opt.n_threads, vae_decode_only, true);
     if (!sd.load_from_file(opt.model_path)) {
         return 1;
     }
diff --git a/stable-diffusion.cpp b/stable-diffusion.cpp
index 9cf5cd5..0c392a1 100644
--- a/stable-diffusion.cpp
+++ b/stable-diffusion.cpp
@@ -1186,7 +1186,9 @@ struct DownSample {
     static void asymmetric_pad(struct ggml_tensor* dst,
                                const struct ggml_tensor* a,
                                const struct ggml_tensor* b,
-                               int ith, int nth, void * userdata) {
+                               int ith,
+                               int nth,
+                               void* userdata) {
         assert(sizeof(dst->nb[0]) == sizeof(float));
         assert(sizeof(a->nb[0]) == sizeof(float));
         assert(sizeof(b->nb[0]) == sizeof(float));
@@ -1450,6 +1452,8 @@ struct UNetModel {
         mem_size += out_channels * model_channels * 3 * 3 * ggml_type_sizef(GGML_TYPE_F16);  // out_2_w
         mem_size += out_channels * ggml_type_sizef(GGML_TYPE_F32);                           // out_2_b
 
+        mem_size += 4 * ggml_tensor_overhead();
+
         return static_cast<size_t>(mem_size);
     }
 
@@ -2470,13 +2474,20 @@ struct CompVisDenoiser {
 
 class StableDiffusionGGML {
    public:
-    ggml_context* params_ctx = NULL;
+    ggml_context* clip_params_ctx = NULL;
+    ggml_context* unet_params_ctx = NULL;
+    ggml_context* vae_params_ctx = NULL;
+
     bool dynamic = true;
-    bool vae_decode_only = true;
+    bool vae_decode_only = false;
+    bool free_params_immediately = false;
     int32_t ftype = 1;
     int n_threads = -1;
     float scale_factor = 0.18215f;
-    size_t max_rt_size = 0;
+    size_t max_mem_size = 0;
+    size_t curr_params_mem_size = 0;
+    size_t max_params_mem_size = 0;
+    size_t max_rt_mem_size = 0;
 
     FrozenCLIPEmbedder cond_stage_model;
     UNetModel diffusion_model;
@@ -2484,19 +2495,29 @@ class StableDiffusionGGML {
 
     CompVisDenoiser denoiser;
 
-    std::map<std::string, struct ggml_tensor*> tensors;
-
     StableDiffusionGGML() = default;
 
-    StableDiffusionGGML(int n_threads, bool vae_decode_only)
-        : n_threads(n_threads), vae_decode_only(vae_decode_only) {
+    StableDiffusionGGML(int n_threads,
+                        bool vae_decode_only,
+                        bool free_params_immediately)  // true: txt2img/img2img free each params ctx right after its stage
+        : vae_decode_only(vae_decode_only),  // initializers follow member declaration order (avoids -Wreorder)
+          free_params_immediately(free_params_immediately),
+          n_threads(n_threads) {
         first_stage_model.decode_only = vae_decode_only;
     }
 
     ~StableDiffusionGGML() {
-        if (params_ctx != NULL) {
-            ggml_free(params_ctx);
-            params_ctx = NULL;
+        if (clip_params_ctx != NULL) {  // NULL if never created, or already freed and reset by txt2img/img2img or a failed load
+            ggml_free(clip_params_ctx);
+            clip_params_ctx = NULL;
+        }
+        if (unet_params_ctx != NULL) {  // same guard for the UNet parameter context
+            ggml_free(unet_params_ctx);
+            unet_params_ctx = NULL;
+        }
+        if (vae_params_ctx != NULL) {  // same guard for the VAE parameter context
+            ggml_free(vae_params_ctx);
+            vae_params_ctx = NULL;
         }
     }
 
@@ -2559,50 +2580,89 @@ class StableDiffusionGGML {
             }
         }
 
-        double ctx_size = 0;
+        // create the ggml contexts for network params (separate ones for clip, unet and vae)
+        LOG_DEBUG("ggml tensor size = %d bytes", (int)sizeof(ggml_tensor));
         {
             // cond_stage_model(FrozenCLIPEmbedder)
+            double ctx_size = 1 * 1024 * 1024;  // 1 MB, for padding
             ctx_size += cond_stage_model.text_model.compute_params_mem_size(wtype);
+            LOG_DEBUG("clip params ctx size = % 6.2f MB", ctx_size / (1024.0 * 1024.0));
 
-            // diffusion_model(UNetModel)
-            ctx_size += diffusion_model.compute_params_mem_size(wtype);
-
-            // first_stage_model(AutoEncoderKL)
-            ctx_size += first_stage_model.compute_params_mem_size(wtype);
-
-            LOG_DEBUG("ggml tensor size = %d bytes", (int)sizeof(ggml_tensor));
-            LOG_INFO("params ctx size = % 6.2f MB", ctx_size / (1024.0 * 1024.0));
-        }
-
-        // create the ggml context for network params
-        {
             struct ggml_init_params params;
             params.mem_size = static_cast<size_t>(ctx_size);
             params.mem_buffer = NULL;
             params.no_alloc = false;
             params.dynamic = false;
 
-            params_ctx = ggml_init(params);
-            if (!params_ctx) {
+            clip_params_ctx = ggml_init(params);
+            if (!clip_params_ctx) {
                 LOG_ERROR("ggml_init() failed");
                 return false;
             }
         }
+
+        {
+            // diffusion_model(UNetModel)
+            double ctx_size = 1 * 1024 * 1024;  // 1 MB, for padding
+            ctx_size += diffusion_model.compute_params_mem_size(wtype);
+            LOG_DEBUG("unet params ctx size = % 6.2f MB", ctx_size / (1024.0 * 1024.0));
+
+            struct ggml_init_params params;
+            params.mem_size = static_cast<size_t>(ctx_size);
+            params.mem_buffer = NULL;
+            params.no_alloc = false;
+            params.dynamic = false;
+
+            unet_params_ctx = ggml_init(params);
+            if (!unet_params_ctx) {
+                LOG_ERROR("ggml_init() failed");
+                ggml_free(clip_params_ctx);
+                clip_params_ctx = NULL;
+                return false;
+            }
+        }
+
+        {
+            // first_stage_model(AutoEncoderKL)
+            double ctx_size = 1 * 1024 * 1024;  // 1 MB, for padding
+            ctx_size += first_stage_model.compute_params_mem_size(wtype);
+            LOG_DEBUG("vae params ctx size = % 6.2f MB", ctx_size / (1024.0 * 1024.0));
+
+            struct ggml_init_params params;
+            params.mem_size = static_cast<size_t>(ctx_size);
+            params.mem_buffer = NULL;
+            params.no_alloc = false;
+            params.dynamic = false;
+
+            vae_params_ctx = ggml_init(params);
+            if (!vae_params_ctx) {
+                LOG_ERROR("ggml_init() failed");
+                ggml_free(clip_params_ctx);
+                clip_params_ctx = NULL;
+                ggml_free(unet_params_ctx);
+                unet_params_ctx = NULL;
+                return false;
+            }
+        }
+
+        std::map<std::string, struct ggml_tensor*> tensors;
+
         LOG_DEBUG("preparing memory for the weights");
         // prepare memory for the weights
         {
             // cond_stage_model(FrozenCLIPEmbedder)
-            cond_stage_model.text_model.init_params(params_ctx, wtype);
+            cond_stage_model.text_model.init_params(clip_params_ctx, wtype);
             cond_stage_model.text_model.map_by_name(tensors, "cond_stage_model.transformer.text_model.");
 
             // diffusion_model(UNetModel)
-            diffusion_model.init_params(params_ctx, wtype);
+            diffusion_model.init_params(unet_params_ctx, wtype);
             diffusion_model.map_by_name(tensors, "model.diffusion_model.");
 
             // firest_stage_model(AutoEncoderKL)
-            first_stage_model.init_params(params_ctx, wtype);
+            first_stage_model.init_params(vae_params_ctx, wtype);
             first_stage_model.map_by_name(tensors, "first_stage_model.");
         }
+
         LOG_DEBUG("loading weights");
         std::set<std::string> tensor_names_in_file;
         int64_t t0 = ggml_time_ms();
@@ -2707,8 +2767,16 @@ class StableDiffusionGGML {
                 file.close();
                 return false;
             }
-            LOG_DEBUG("model size = %8.2fMB", total_size / 1024.0 / 1024.0);
+            LOG_DEBUG("model size = %.2fMB", total_size / 1024.0 / 1024.0);
         }
+        max_params_mem_size = ggml_used_mem(clip_params_ctx) + ggml_used_mem(unet_params_ctx) + ggml_used_mem(vae_params_ctx);
+        max_mem_size = max_params_mem_size;
+        curr_params_mem_size = max_params_mem_size;
+        LOG_INFO("total params size = %.2fMB (clip %.2fMB, unet %.2fMB, vae %.2fMB)",
+                 max_params_mem_size / 1024.0 / 1024.0,
+                 ggml_used_mem(clip_params_ctx) / 1024.0 / 1024.0,
+                 ggml_used_mem(unet_params_ctx) / 1024.0 / 1024.0,
+                 ggml_used_mem(vae_params_ctx) / 1024.0 / 1024.0);
         int64_t t1 = ggml_time_ms();
         LOG_INFO("loading model from '%s' completed, taking %.2fs", file_path.c_str(), (t1 - t0) * 1.0f / 1000);
         file.close();
@@ -2784,14 +2852,26 @@ class StableDiffusionGGML {
         copy_ggml_tensor(result, hidden_states);
 
         // print_ggml_tensor(result);
-        size_t rt_size = ctx_size + ggml_curr_max_dynamic_size();
-        if (rt_size > max_rt_size) {
-            max_rt_size = rt_size;
+        size_t rt_mem_size = ctx_size + ggml_curr_max_dynamic_size();
+        if (rt_mem_size > max_rt_mem_size) {
+            max_rt_mem_size = rt_mem_size;
         }
-        LOG_INFO("condition graph use %.2fMB of memory: static %.2fMB, dynamic = %.2fMB",
-                 rt_size * 1.0f / 1024 / 1024,
-                 ctx_size * 1.0f / 1024 / 1024,
-                 ggml_curr_max_dynamic_size() * 1.0f / 1024 / 1024);
+        size_t graph_mem_size = ggml_used_mem(clip_params_ctx) + rt_mem_size;
+
+        size_t curr_mem_size = curr_params_mem_size + rt_mem_size;
+        if (curr_mem_size > max_mem_size) {
+            max_mem_size = curr_mem_size;
+        }
+
+        LOG_INFO(
+            "condition graph use %.2fMB of memory: params %.2fMB, "
+            "runtime %.2fMB (static %.2fMB, dynamic %.2fMB)",
+            graph_mem_size * 1.0f / 1024 / 1024,
+            ggml_used_mem(clip_params_ctx) * 1.0f / 1024 / 1024,
+            rt_mem_size * 1.0f / 1024 / 1024,
+            ctx_size * 1.0f / 1024 / 1024,
+            ggml_curr_max_dynamic_size() * 1.0f / 1024 / 1024);
+
         LOG_DEBUG("%zu bytes of dynamic memory has not been released yet", ggml_dynamic_size());
 
         ggml_free(ctx);
@@ -2985,21 +3065,33 @@ class StableDiffusionGGML {
                 }
                 int64_t t1 = ggml_time_ms();
                 LOG_INFO("step %d sampling completed, taking %.2fs", i + 1, (t1 - t0) * 1.0f / 1000);
-                LOG_DEBUG("diffusion graph use %.2fMB of memory: static %.2fMB, dynamic = %.2fMB",
+                LOG_DEBUG("diffusion graph use %.2fMB runtime memory: static %.2fMB, dynamic %.2fMB",
                           (ctx_size + ggml_curr_max_dynamic_size()) * 1.0f / 1024 / 1024,
                           ctx_size * 1.0f / 1024 / 1024,
                           ggml_curr_max_dynamic_size() * 1.0f / 1024 / 1024);
                 LOG_DEBUG("%zu bytes of dynamic memory has not been released yet", ggml_dynamic_size());
             }
         }
-        size_t rt_size = ctx_size + ggml_curr_max_dynamic_size();
-        if (rt_size > max_rt_size) {
-            max_rt_size = rt_size;
+
+        size_t rt_mem_size = ctx_size + ggml_curr_max_dynamic_size();
+        if (rt_mem_size > max_rt_mem_size) {
+            max_rt_mem_size = rt_mem_size;
         }
-        LOG_INFO("diffusion graph use %.2fMB of memory: static %.2fMB, dynamic = %.2fMB",
-                 rt_size * 1.0f / 1024 / 1024,
-                 ctx_size * 1.0f / 1024 / 1024,
-                 ggml_curr_max_dynamic_size() * 1.0f / 1024 / 1024);
+        size_t graph_mem_size = ggml_used_mem(unet_params_ctx) + rt_mem_size;
+
+        size_t curr_mem_size = curr_params_mem_size + rt_mem_size;
+        if (curr_mem_size > max_mem_size) {
+            max_mem_size = curr_mem_size;
+        }
+
+        LOG_INFO(
+            "diffusion graph use %.2fMB of memory: params %.2fMB, "
+            "runtime %.2fMB (static %.2fMB, dynamic %.2fMB)",
+            graph_mem_size * 1.0f / 1024 / 1024,
+            ggml_used_mem(unet_params_ctx) * 1.0f / 1024 / 1024,
+            rt_mem_size * 1.0f / 1024 / 1024,
+            ctx_size * 1.0f / 1024 / 1024,
+            ggml_curr_max_dynamic_size() * 1.0f / 1024 / 1024);
         LOG_DEBUG("%zu bytes of dynamic memory has not been released yet", ggml_dynamic_size());
 
         ggml_free(ctx);
@@ -3065,14 +3157,25 @@ class StableDiffusionGGML {
             result = ggml_dup_tensor(res_ctx, moments);
             copy_ggml_tensor(result, moments);
 
-            size_t rt_size = ctx_size + ggml_curr_max_dynamic_size();
-            if (rt_size > max_rt_size) {
-                max_rt_size = rt_size;
+            size_t rt_mem_size = ctx_size + ggml_curr_max_dynamic_size();
+            if (rt_mem_size > max_rt_mem_size) {
+                max_rt_mem_size = rt_mem_size;
             }
-            LOG_INFO("vae graph use %.2fMB of memory: static %.2fMB, dynamic = %.2fMB",
-                     rt_size * 1.0f / 1024 / 1024,
-                     ctx_size * 1.0f / 1024 / 1024,
-                     ggml_curr_max_dynamic_size() * 1.0f / 1024 / 1024);
+            size_t graph_mem_size = ggml_used_mem(vae_params_ctx) + rt_mem_size;
+
+            size_t curr_mem_size = curr_params_mem_size + rt_mem_size;
+            if (curr_mem_size > max_mem_size) {
+                max_mem_size = curr_mem_size;
+            }
+
+            LOG_INFO(
+                "vae graph use %.2fMB of memory: params %.2fMB, "
+                "runtime %.2fMB (static %.2fMB, dynamic %.2fMB)",
+                graph_mem_size * 1.0f / 1024 / 1024,
+                ggml_used_mem(vae_params_ctx) * 1.0f / 1024 / 1024,
+                rt_mem_size * 1.0f / 1024 / 1024,
+                ctx_size * 1.0f / 1024 / 1024,
+                ggml_curr_max_dynamic_size() * 1.0f / 1024 / 1024);
             LOG_DEBUG("%zu bytes of dynamic memory has not been released yet", ggml_dynamic_size());
 
             ggml_free(ctx);
@@ -3179,14 +3282,25 @@ class StableDiffusionGGML {
             result_img = ggml_dup_tensor(res_ctx, img);
             copy_ggml_tensor(result_img, img);
 
-            size_t rt_size = ctx_size + ggml_curr_max_dynamic_size();
-            if (rt_size > max_rt_size) {
-                max_rt_size = rt_size;
+            size_t rt_mem_size = ctx_size + ggml_curr_max_dynamic_size();
+            if (rt_mem_size > max_rt_mem_size) {
+                max_rt_mem_size = rt_mem_size;
             }
-            LOG_INFO("vae graph use %.2fMB of memory: static %.2fMB, dynamic = %.2fMB",
-                     rt_size * 1.0f / 1024 / 1024,
-                     ctx_size * 1.0f / 1024 / 1024,
-                     ggml_curr_max_dynamic_size() * 1.0f / 1024 / 1024);
+            size_t graph_mem_size = ggml_used_mem(vae_params_ctx) + rt_mem_size;
+
+            size_t curr_mem_size = curr_params_mem_size + rt_mem_size;
+            if (curr_mem_size > max_mem_size) {
+                max_mem_size = curr_mem_size;
+            }
+
+            LOG_INFO(
+                "vae graph use %.2fMB of memory: params %.2fMB, "
+                "runtime %.2fMB (static %.2fMB, dynamic %.2fMB)",
+                graph_mem_size * 1.0f / 1024 / 1024,
+                ggml_used_mem(vae_params_ctx) * 1.0f / 1024 / 1024,
+                rt_mem_size * 1.0f / 1024 / 1024,
+                ctx_size * 1.0f / 1024 / 1024,
+                ggml_curr_max_dynamic_size() * 1.0f / 1024 / 1024);
             LOG_DEBUG("%zu bytes of dynamic memory has not been released yet", ggml_dynamic_size());
 
             ggml_free(ctx);
@@ -3198,8 +3312,12 @@ class StableDiffusionGGML {
 
 /*================================================= StableDiffusion ==================================================*/
 
-StableDiffusion::StableDiffusion(int n_threads, bool vae_decode_only) {
-    sd = std::make_shared<StableDiffusionGGML>(n_threads, vae_decode_only);
+StableDiffusion::StableDiffusion(int n_threads,
+                                 bool vae_decode_only,
+                                 bool free_params_immediately) {  // arguments are forwarded unchanged to StableDiffusionGGML
+    sd = std::make_shared<StableDiffusionGGML>(n_threads,
+                                               vae_decode_only,
+                                               free_params_immediately);
 }
 
 bool StableDiffusion::load_from_file(const std::string& file_path) {
@@ -3240,6 +3358,12 @@ std::vector<uint8_t> StableDiffusion::txt2img(const std::string& prompt,
     int64_t t1 = ggml_time_ms();
     LOG_INFO("get_learned_condition completed, taking %.2fs", (t1 - t0) * 1.0f / 1000);
 
+    if (sd->free_params_immediately) {
+        sd->curr_params_mem_size -= ggml_used_mem(sd->clip_params_ctx);
+        ggml_free(sd->clip_params_ctx);
+        sd->clip_params_ctx = NULL;
+    }
+
     int C = 4;
     int W = width / 8;
     int H = height / 8;
@@ -3255,18 +3379,32 @@ std::vector<uint8_t> StableDiffusion::txt2img(const std::string& prompt,
     int64_t t2 = ggml_time_ms();
     LOG_INFO("sampling completed, taking %.2fs", (t2 - t1) * 1.0f / 1000);
 
+    if (sd->free_params_immediately) {
+        sd->curr_params_mem_size -= ggml_used_mem(sd->unet_params_ctx);
+        ggml_free(sd->unet_params_ctx);
+        sd->unet_params_ctx = NULL;
+    }
+
     struct ggml_tensor* img = sd->decode_first_stage(ctx, x_0);
     if (img != NULL) {
         result = ggml_to_image_vec(img);
     }
     int64_t t3 = ggml_time_ms();
     LOG_INFO("decode_first_stage completed, taking %.2fs", (t3 - t2) * 1.0f / 1000);
+
+    if (sd->free_params_immediately) {
+        sd->curr_params_mem_size -= ggml_used_mem(sd->vae_params_ctx);
+        ggml_free(sd->vae_params_ctx);
+        sd->vae_params_ctx = NULL;
+    }
+
     LOG_INFO(
-        "txt2img completed in %.2fs, "
-        "with a runtime memory usage of %.2fMB and parameter memory usage of %.2fMB",
+        "txt2img completed in %.2fs, use %.2fMB of memory: peak params memory %.2fMB, "
+        "peak runtime memory %.2fMB",
         (t3 - t0) * 1.0f / 1000,
-        sd->max_rt_size * 1.0f / 1024 / 1024,
-        ggml_used_mem(sd->params_ctx) * 1.0f / 1024 / 1024);
+        sd->max_mem_size * 1.0f / 1024 / 1024,
+        sd->max_params_mem_size * 1.0f / 1024 / 1024,
+        sd->max_rt_mem_size * 1.0f / 1024 / 1024);
 
     ggml_free(ctx);
     return result;
@@ -3330,6 +3468,11 @@ std::vector<uint8_t> StableDiffusion::img2img(const std::vector<uint8_t>& init_i
     }
     int64_t t2 = ggml_time_ms();
     LOG_INFO("get_learned_condition completed, taking %.2fs", (t2 - t1) * 1.0f / 1000);
+    if (sd->free_params_immediately) {
+        sd->curr_params_mem_size -= ggml_used_mem(sd->clip_params_ctx);
+        ggml_free(sd->clip_params_ctx);
+        sd->clip_params_ctx = NULL;
+    }
 
     LOG_INFO("start sampling");
     struct ggml_tensor* x_0 = sd->sample(ctx, init_latent, c, uc, cfg_scale, sample_method, sigma_sched);
@@ -3337,6 +3480,11 @@ std::vector<uint8_t> StableDiffusion::img2img(const std::vector<uint8_t>& init_i
     // print_ggml_tensor(x_0);
     int64_t t3 = ggml_time_ms();
     LOG_INFO("sampling completed, taking %.2fs", (t3 - t2) * 1.0f / 1000);
+    if (sd->free_params_immediately) {
+        sd->curr_params_mem_size -= ggml_used_mem(sd->unet_params_ctx);
+        ggml_free(sd->unet_params_ctx);
+        sd->unet_params_ctx = NULL;
+    }
 
     struct ggml_tensor* img = sd->decode_first_stage(ctx, x_0);
     if (img != NULL) {
@@ -3344,12 +3492,20 @@ std::vector<uint8_t> StableDiffusion::img2img(const std::vector<uint8_t>& init_i
     }
     int64_t t4 = ggml_time_ms();
     LOG_INFO("decode_first_stage completed, taking %.2fs", (t4 - t3) * 1.0f / 1000);
+
+    if (sd->free_params_immediately) {
+        sd->curr_params_mem_size -= ggml_used_mem(sd->vae_params_ctx);
+        ggml_free(sd->vae_params_ctx);
+        sd->vae_params_ctx = NULL;
+    }
+
     LOG_INFO(
-        "img2img completed in %.2fs, "
-        "with a runtime memory usage of %.2fMB and parameter memory usage of %.2fMB",
+        "img2img completed in %.2fs, use %.2fMB of memory: peak params memory %.2fMB, "
+        "peak runtime memory %.2fMB",
         (t4 - t0) * 1.0f / 1000,
-        sd->max_rt_size * 1.0f / 1024 / 1024,
-        ggml_used_mem(sd->params_ctx) * 1.0f / 1024 / 1024);
+        sd->max_mem_size * 1.0f / 1024 / 1024,
+        sd->max_params_mem_size * 1.0f / 1024 / 1024,
+        sd->max_rt_mem_size * 1.0f / 1024 / 1024);
 
     ggml_free(ctx);
 
diff --git a/stable-diffusion.h b/stable-diffusion.h
index 4d15fa8..730a655 100644
--- a/stable-diffusion.h
+++ b/stable-diffusion.h
@@ -23,7 +23,8 @@ class StableDiffusion {
 
    public:
     StableDiffusion(int n_threads = -1,
-                    bool vae_decode_only = false);
+                    bool vae_decode_only = false,
+                    bool free_params_immediately = false);
     bool load_from_file(const std::string& file_path);
     std::vector<uint8_t> txt2img(
         const std::string& prompt,