fix: improve VAE tiling (#372)

* fix and improve: VAE tiling
  - properly handle the upper left corner, interpolating both x and y
  - refactor out lerp
  - use smootherstep to preserve more detail and spend less area blending
* actually fix VAE tile merging
  Co-authored-by: stduhpf <stephduh@live.fr>
* remove the now unused lerp function
---------
Co-authored-by: stduhpf <stephduh@live.fr>
2024-08-27 18:21:12 +02:00
parent f4c937cb94
commit e71ddcedad
1 changed files with 26 additions and 9 deletions
--- a/ggml_extend.hpp
+++ b/ggml_extend.hpp
@@ -353,6 +353,12 @@ __STATIC_INLINE__ void ggml_split_tensor_2d(struct ggml_tensor* input,
    }
 }

+// Smootherstep easing curve; the input is NOT clamped here -> expects x in the range [0, 1]
+__STATIC_INLINE__ float ggml_smootherstep_f32(const float x) {
+    GGML_ASSERT(x >= 0.f && x <= 1.f);  // out-of-range input is treated as a caller bug rather than clamped
+    return x * x * x * (x * (6.0f * x - 15.0f) + 10.0f);  // 6x^5 - 15x^4 + 10x^3, factored: yields 0 at x=0 and 1 at x=1
+}
+
 __STATIC_INLINE__ void ggml_merge_tensor_2d(struct ggml_tensor* input,
                                            struct ggml_tensor* output,
                                            int x,
@@ -361,6 +367,10 @@ __STATIC_INLINE__ void ggml_merge_tensor_2d(struct ggml_tensor* input,
    int64_t width    = input->ne[0];
    int64_t height   = input->ne[1];
    int64_t channels = input->ne[2];
+
+    int64_t img_width    = output->ne[0];
+    int64_t img_height   = output->ne[1];
+
    GGML_ASSERT(input->type == GGML_TYPE_F32 && output->type == GGML_TYPE_F32);
    for (int iy = 0; iy < height; iy++) {
        for (int ix = 0; ix < width; ix++) {
@@ -368,16 +378,23 @@ __STATIC_INLINE__ void ggml_merge_tensor_2d(struct ggml_tensor* input,
                float new_value = ggml_tensor_get_f32(input, ix, iy, k);
                if (overlap > 0) {  // blend colors in overlapped area
                    float old_value = ggml_tensor_get_f32(output, x + ix, y + iy, k);
-                    if (x > 0 && ix < overlap) {  // in overlapped horizontal
-                        ggml_tensor_set_f32(output, old_value + (new_value - old_value) * (ix / (1.0f * overlap)), x + ix, y + iy, k);
-                        continue;
-                    }
-                    if (y > 0 && iy < overlap) {  // in overlapped vertical
-                        ggml_tensor_set_f32(output, old_value + (new_value - old_value) * (iy / (1.0f * overlap)), x + ix, y + iy, k);
-                        continue;
-                    }
+
+                    const float x_f_0 = (x > 0) ? ix / float(overlap) : 1;
+                    const float x_f_1 = (x < (img_width - width)) ? (width - ix) / float(overlap) : 1 ;
+                    const float y_f_0 = (y > 0) ? iy / float(overlap) : 1;
+                    const float y_f_1 = (y < (img_height - height)) ? (height - iy) / float(overlap) : 1;
+
+                    const float x_f = std::min(std::min(x_f_0, x_f_1), 1.f);
+                    const float y_f = std::min(std::min(y_f_0, y_f_1), 1.f);
+
+                    ggml_tensor_set_f32(
+                        output,
+                        old_value + new_value * ggml_smootherstep_f32(y_f) * ggml_smootherstep_f32(x_f),
+                        x + ix, y + iy, k
+                    );
+                } else {
+                    ggml_tensor_set_f32(output, new_value, x + ix, y + iy, k);
                }
-                ggml_tensor_set_f32(output, new_value, x + ix, y + iy, k);
            }
        }
    }