// stable-diffusion.cpp

#include <assert.h>
#include <inttypes.h>
#include <stdarg.h>
#include <algorithm>
#include <cstring>
#include <fstream>
#include <iostream>
#include <iterator>
#include <map>
#include <random>
#include <regex>
#include <set>
#include <sstream>
#include <string>
#include <unordered_map>
#include <vector>
#include "ggml/ggml-alloc.h"
#include "ggml/ggml-backend.h"
#include "ggml/ggml.h"
#ifdef SD_USE_CUBLAS
#include "ggml-cuda.h"
#endif
#ifdef SD_USE_METAL
#include "ggml-metal.h"
#endif
#include "model.h"
#include "rng.h"
#include "rng_philox.h"
#include "stable-diffusion.h"
#include "util.h"
#define EPS 1e-05f
#define UNET_GRAPH_SIZE 10240
#define LORA_GRAPH_SIZE 10240
#define TIMESTEPS 1000
const char* model_version_to_str[] = {
"1.x",
"2.x",
"XL",
};
const char* sampling_methods_str[] = {
"Euler A",
"Euler",
"Heun",
"DPM2",
"DPM++ (2s)",
"DPM++ (2M)",
"modified DPM++ (2M)",
"LCM",
};
/*================================================== Helper Functions ================================================*/
std::string sd_get_system_info() {
std::stringstream ss;
ss << "System Info: \n";
ss << " BLAS = " << ggml_cpu_has_blas() << std::endl;
ss << " SSE3 = " << ggml_cpu_has_sse3() << std::endl;
ss << " AVX = " << ggml_cpu_has_avx() << std::endl;
ss << " AVX2 = " << ggml_cpu_has_avx2() << std::endl;
ss << " AVX512 = " << ggml_cpu_has_avx512() << std::endl;
ss << " AVX512_VBMI = " << ggml_cpu_has_avx512_vbmi() << std::endl;
ss << " AVX512_VNNI = " << ggml_cpu_has_avx512_vnni() << std::endl;
ss << " FMA = " << ggml_cpu_has_fma() << std::endl;
ss << " NEON = " << ggml_cpu_has_neon() << std::endl;
ss << " ARM_FMA = " << ggml_cpu_has_arm_fma() << std::endl;
ss << " F16C = " << ggml_cpu_has_f16c() << std::endl;
ss << " FP16_VA = " << ggml_cpu_has_fp16_va() << std::endl;
ss << " WASM_SIMD = " << ggml_cpu_has_wasm_simd() << std::endl;
ss << " VSX = " << ggml_cpu_has_vsx() << std::endl;
return ss.str();
}
static void ggml_log_callback_default(ggml_log_level level, const char* text, void* user_data) {
(void)level;
(void)user_data;
fputs(text, stderr);
fflush(stderr);
}
void ggml_tensor_set_f32_randn(struct ggml_tensor* tensor, std::shared_ptr<RNG> rng) {
uint32_t n = (uint32_t)ggml_nelements(tensor);
std::vector<float> random_numbers = rng->randn(n);
for (uint32_t i = 0; i < n; i++) {
ggml_set_f32_1d(tensor, i, random_numbers[i]);
}
}
void pretty_progress(int step, int steps, float time) {
std::string progress = " |";
int max_progress = 50;
int32_t current = (int32_t)(step * 1.f * max_progress / steps);
for (int i = 0; i < 50; i++) {
if (i > current) {
progress += " ";
} else if (i == current && i != max_progress - 1) {
progress += ">";
} else {
progress += "=";
}
}
progress += "|";
printf(time > 1.0f ? "\r%s %i/%i - %.2fs/it" : "\r%s %i/%i - %.2fit/s",
progress.c_str(), step, steps,
time > 1.0f || time == 0 ? time : (1.0f / time));
fflush(stdout); // for linux
if (step == steps) {
printf("\n");
}
}
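// Usage sketch (illustrative only, added for documentation; not called by the pipeline):
// pretty_progress() takes the 1-based step index, the total step count and the seconds
// spent on the last step; it prints "s/it" when a step takes longer than a second and
// "it/s" otherwise.
static void pretty_progress_example() {
    const int steps = 20;
    for (int step = 1; step <= steps; step++) {
        float seconds_per_step = 0.35f;  // pretend each step took 0.35s
        pretty_progress(step, steps, seconds_per_step);
    }
}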
// set tensor[i, j, k, l]
// set tensor[l]
// set tensor[k, l]
// set tensor[j, k, l]
void ggml_tensor_set_f32(struct ggml_tensor* tensor, float value, int l, int k = 0, int j = 0, int i = 0) {
GGML_ASSERT(tensor->nb[0] == sizeof(float));
*(float*)((char*)(tensor->data) + i * tensor->nb[3] + j * tensor->nb[2] + k * tensor->nb[1] + l * tensor->nb[0]) = value;
}
float ggml_tensor_get_f32(const ggml_tensor* tensor, int l, int k = 0, int j = 0, int i = 0) {
// float value;
// ggml_backend_tensor_get(tensor, &value, i * tensor->nb[3] + j * tensor->nb[2] + k * tensor->nb[1] + l * tensor->nb[0], sizeof(float));
// return value;
GGML_ASSERT(tensor->nb[0] == sizeof(float));
return *(float*)((char*)(tensor->data) + i * tensor->nb[3] + j * tensor->nb[2] + k * tensor->nb[1] + l * tensor->nb[0]);
}
ggml_fp16_t ggml_tensor_get_f16(const ggml_tensor* tensor, int l, int k = 0, int j = 0, int i = 0) {
GGML_ASSERT(tensor->nb[0] == sizeof(ggml_fp16_t));
return *(ggml_fp16_t*)((char*)(tensor->data) + i * tensor->nb[3] + j * tensor->nb[2] + k * tensor->nb[1] + l * tensor->nb[0]);
}
void print_ggml_tensor(struct ggml_tensor* tensor, bool shape_only = false) {
printf("shape(%zu, %zu, %zu, %zu)\n", tensor->ne[0], tensor->ne[1], tensor->ne[2], tensor->ne[3]);
fflush(stdout);
if (shape_only) {
return;
}
int range = 3;
for (int i = 0; i < tensor->ne[3]; i++) {
if (i >= range && i + range < tensor->ne[3]) {
continue;
}
for (int j = 0; j < tensor->ne[2]; j++) {
if (j >= range && j + range < tensor->ne[2]) {
continue;
}
for (int k = 0; k < tensor->ne[1]; k++) {
if (k >= range && k + range < tensor->ne[1]) {
continue;
}
for (int l = 0; l < tensor->ne[0]; l++) {
if (l >= range && l + range < tensor->ne[0]) {
continue;
}
if (tensor->type == GGML_TYPE_F32) {
printf(" [%d, %d, %d, %d] = %f\n", i, j, k, l, ggml_tensor_get_f32(tensor, l, k, j, i));
} else if (tensor->type == GGML_TYPE_F16) {
printf(" [%d, %d, %d, %d] = %i\n", i, j, k, l, ggml_tensor_get_f16(tensor, l, k, j, i));
}
fflush(stdout);
}
}
}
}
}
ggml_tensor* load_tensor_from_file(ggml_context* ctx, const std::string& file_path) {
std::ifstream file(file_path, std::ios::binary);
if (!file.is_open()) {
LOG_ERROR("failed to open '%s'", file_path.c_str());
return NULL;
}
int32_t n_dims;
int32_t length;
int32_t ttype;
file.read(reinterpret_cast<char*>(&n_dims), sizeof(n_dims));
file.read(reinterpret_cast<char*>(&length), sizeof(length));
file.read(reinterpret_cast<char*>(&ttype), sizeof(ttype));
if (file.eof()) {
LOG_ERROR("incomplete file '%s'", file_path.c_str());
return NULL;
}
int32_t nelements = 1;
int32_t ne[4] = {1, 1, 1, 1};
for (int i = 0; i < n_dims; ++i) {
file.read(reinterpret_cast<char*>(&ne[i]), sizeof(ne[i]));
nelements *= ne[i];
}
std::string name(length, 0);
file.read(&name[0], length);
ggml_tensor* tensor = ggml_new_tensor_4d(ctx, (ggml_type)ttype, ne[0], ne[1], ne[2], ne[3]);
const size_t bpe = ggml_type_size(ggml_type(ttype));
file.read(reinterpret_cast<char*>(tensor->data), ggml_nbytes(tensor));
return tensor;
}
// void save_tensor_to_file(const std::string& file_name, ggml_tensor* tensor, const std::string & name) {
// std::string file_name_ = file_name + ".tensor";
// std::string name_ = name;
// std::ofstream file("./" + file_name_, std::ios::binary);
// file.write(reinterpret_cast<char*>(&tensor->n_dims), sizeof(tensor->n_dims));
// int len = (int)name_.size();
// file.write(reinterpret_cast<char*>(&len), sizeof(len));
// int ttype = (int)tensor->type;
// file.write(reinterpret_cast<char*>(&ttype), sizeof(ttype));
// for (int i = 0; i < tensor->n_dims; ++i) {
// int ne_ = (int) tensor->ne[i];
// file.write(reinterpret_cast<char*>(&ne_), sizeof(ne_));
// }
// file.write(&name_[0], len);
// char* data = nullptr;
// file.write((char*)tensor->data, ggml_nbytes(tensor));
// file.close();
// }
void sd_fread(void* ptr, size_t size, size_t count, FILE* stream) {
size_t ret = std::fread(ptr, size, count, stream);
if (ret != count) {
printf("Error: read from file failed");
exit(1);
}
}
void copy_ggml_tensor(struct ggml_tensor* dst, struct ggml_tensor* src) {
if (dst->type == src->type) {
dst->nb[0] = src->nb[0];
dst->nb[1] = src->nb[1];
dst->nb[2] = src->nb[2];
dst->nb[3] = src->nb[3];
memcpy(((char*)dst->data), ((char*)src->data), ggml_nbytes(dst));
return;
}
struct ggml_init_params params;
params.mem_size = 10 * 1024 * 1024; // for padding
params.mem_buffer = NULL;
params.no_alloc = false;
struct ggml_context* ctx = ggml_init(params);
if (!ctx) {
LOG_ERROR("ggml_init() failed");
return;
}
ggml_tensor* final = ggml_cpy_inplace(ctx, src, dst);
struct ggml_cgraph* graph = ggml_new_graph(ctx);
ggml_build_forward_expand(graph, final);
ggml_graph_compute_with_ctx(ctx, graph, 1);
ggml_free(ctx);
}
void calculate_alphas_cumprod(float* alphas_cumprod,
float linear_start = 0.00085f,
float linear_end = 0.0120,
int timesteps = TIMESTEPS) {
float ls_sqrt = sqrtf(linear_start);
float le_sqrt = sqrtf(linear_end);
float amount = le_sqrt - ls_sqrt;
float product = 1.0f;
for (int i = 0; i < timesteps; i++) {
float beta = ls_sqrt + amount * ((float)i / (timesteps - 1));
product *= 1.0f - powf(beta, 2.0f);
alphas_cumprod[i] = product;
}
}
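// The loop above is the "scaled linear" beta schedule used by Stable Diffusion:
// beta_t = (sqrt(linear_start) + t / (T - 1) * (sqrt(linear_end) - sqrt(linear_start)))^2
// and alphas_cumprod[t] = prod_{s <= t} (1 - beta_s).
// Minimal usage sketch (illustrative only, default schedule):
static void calculate_alphas_cumprod_example() {
    float alphas_cumprod[TIMESTEPS];
    calculate_alphas_cumprod(alphas_cumprod);
    // alphas_cumprod decreases monotonically from close to 1.0 towards 0.0
    printf("alphas_cumprod[0] = %f, alphas_cumprod[%d] = %f\n",
           alphas_cumprod[0], TIMESTEPS - 1, alphas_cumprod[TIMESTEPS - 1]);
}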
// Ref: https://github.com/CompVis/stable-diffusion/blob/main/ldm/modules/diffusionmodules/util.py#L151
void set_timestep_embedding(struct ggml_tensor* timesteps, struct ggml_tensor* embedding, int dim, int max_period = 10000) {
// timesteps: [N,]
// embedding: [dim, N]
int half = dim / 2;
std::vector<float> freqs(half);
for (int i = 0; i < half; ++i) {
freqs[i] = (float)std::exp(-std::log(max_period) * i / half);
}
for (int i = 0; i < timesteps->ne[0]; ++i) {
for (int j = 0; j < half; ++j) {
float arg = ggml_get_f32_1d(timesteps, i) * freqs[j];
ggml_tensor_set_f32(embedding, std::cos(arg), j, i);
ggml_tensor_set_f32(embedding, std::sin(arg), j + half, i);
}
if (dim % 2 != 0) {
*(float*)((char*)embedding->data + i * embedding->nb[1] + dim * embedding->nb[0]) = 0;
}
}
}
struct ggml_tensor* new_timestep_embedding(struct ggml_context* ctx, struct ggml_allocr* allocr, struct ggml_tensor* timesteps, int dim, int max_period = 10000) {
// timesteps: [N,]
// embedding: [dim, N]
int actual_dim = dim;
if (dim % 2 != 0) {
actual_dim = dim + 1;
}
struct ggml_tensor* embedding = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, actual_dim, timesteps->ne[0]);
if (allocr != NULL) {
ggml_allocr_alloc(allocr, embedding);
}
if (allocr != NULL && !ggml_allocr_is_measure(allocr)) {
set_timestep_embedding(timesteps, embedding, dim, max_period);
}
return embedding;
}
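// Usage sketch (illustrative only, added for documentation): build the sinusoidal
// embedding of a single timestep on the CPU. Note that new_timestep_embedding() only
// fills the tensor when a non-measuring allocator is passed, so with a NULL allocator
// the values are written by calling set_timestep_embedding() explicitly.
static void timestep_embedding_example() {
    struct ggml_init_params params;
    params.mem_size   = 1 * 1024 * 1024;  // small scratch buffer for two tensors
    params.mem_buffer = NULL;
    params.no_alloc   = false;
    struct ggml_context* ctx = ggml_init(params);
    struct ggml_tensor* timesteps = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 1);
    ggml_set_f32_1d(timesteps, 0, 999.0f);  // t = 999
    struct ggml_tensor* emb = new_timestep_embedding(ctx, NULL, timesteps, 320);  // [320, 1]
    set_timestep_embedding(timesteps, emb, 320);
    print_ggml_tensor(emb, true);  // shape only
    ggml_free(ctx);
}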
// SPECIAL OPERATIONS WITH TENSORS
uint8_t* sd_tensor_to_image(struct ggml_tensor* input) {
int64_t width = input->ne[0];
int64_t height = input->ne[1];
int64_t channels = input->ne[2];
GGML_ASSERT(channels == 3 && input->type == GGML_TYPE_F32);
uint8_t* image_data = (uint8_t*)malloc(width * height * channels);
for (int iy = 0; iy < height; iy++) {
for (int ix = 0; ix < width; ix++) {
for (int k = 0; k < channels; k++) {
float value = ggml_tensor_get_f32(input, ix, iy, k);
*(image_data + iy * width * channels + ix * channels + k) = (uint8_t)(value * 255.0f);
}
}
}
return image_data;
}
void sd_image_to_tensor(const uint8_t* image_data,
struct ggml_tensor* output) {
int64_t width = output->ne[0];
int64_t height = output->ne[1];
int64_t channels = output->ne[2];
GGML_ASSERT(channels == 3 && output->type == GGML_TYPE_F32);
for (int iy = 0; iy < height; iy++) {
for (int ix = 0; ix < width; ix++) {
for (int k = 0; k < channels; k++) {
float value = *(image_data + iy * width * channels + ix * channels + k);
ggml_tensor_set_f32(output, value / 255.0f, ix, iy, k);
}
}
}
}
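// Usage sketch (illustrative only, added for documentation): round-trip an RGB buffer
// through a float tensor. sd_image_to_tensor() maps bytes [0, 255] to floats [0, 1] and
// sd_tensor_to_image() maps them back; the caller owns the returned buffer. The rgb
// pointer is assumed to hold width * height * 3 bytes.
static void image_tensor_roundtrip_example(const uint8_t* rgb, int width, int height) {
    struct ggml_init_params params;
    params.mem_size   = (size_t)width * height * 3 * sizeof(float) + 1024 * 1024;
    params.mem_buffer = NULL;
    params.no_alloc   = false;
    struct ggml_context* ctx = ggml_init(params);
    struct ggml_tensor* t = ggml_new_tensor_4d(ctx, GGML_TYPE_F32, width, height, 3, 1);
    sd_image_to_tensor(rgb, t);             // [W, H, 3, 1], values in [0, 1]
    uint8_t* out = sd_tensor_to_image(t);   // malloc'ed W*H*3 buffer, values in [0, 255]
    free(out);
    ggml_free(ctx);
}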
void ggml_split_tensor_2d(struct ggml_tensor* input,
struct ggml_tensor* output,
int x,
int y) {
int64_t width = output->ne[0];
int64_t height = output->ne[1];
int64_t channels = output->ne[2];
GGML_ASSERT(input->type == GGML_TYPE_F32 && output->type == GGML_TYPE_F32);
for (int iy = 0; iy < height; iy++) {
for (int ix = 0; ix < width; ix++) {
for (int k = 0; k < channels; k++) {
float value = ggml_tensor_get_f32(input, ix + x, iy + y, k);
ggml_tensor_set_f32(output, value, ix, iy, k);
}
}
}
}
void ggml_merge_tensor_2d(struct ggml_tensor* input,
struct ggml_tensor* output,
int x,
int y,
int overlap) {
int64_t width = input->ne[0];
int64_t height = input->ne[1];
int64_t channels = input->ne[2];
GGML_ASSERT(input->type == GGML_TYPE_F32 && output->type == GGML_TYPE_F32);
for (int iy = 0; iy < height; iy++) {
for (int ix = 0; ix < width; ix++) {
for (int k = 0; k < channels; k++) {
float new_value = ggml_tensor_get_f32(input, ix, iy, k);
if (overlap > 0) { // blend colors in overlapped area
float old_value = ggml_tensor_get_f32(output, x + ix, y + iy, k);
if (x > 0 && ix < overlap) { // in overlapped horizontal
ggml_tensor_set_f32(output, old_value + (new_value - old_value) * (ix / (1.0f * overlap)), x + ix, y + iy, k);
continue;
}
if (y > 0 && iy < overlap) { // in overlapped vertical
ggml_tensor_set_f32(output, old_value + (new_value - old_value) * (iy / (1.0f * overlap)), x + ix, y + iy, k);
continue;
}
}
ggml_tensor_set_f32(output, new_value, x + ix, y + iy, k);
}
}
}
}
float ggml_tensor_mean(struct ggml_tensor* src) {
float mean = 0.0f;
int64_t nelements = ggml_nelements(src);
float* data = (float*)src->data;
for (int i = 0; i < nelements; i++) {
mean += data[i] / nelements * 1.0f;
}
return mean;
}
// a = a+b
void ggml_tensor_add(struct ggml_tensor* a, struct ggml_tensor* b) {
GGML_ASSERT(ggml_nelements(a) == ggml_nelements(b));
int64_t nelements = ggml_nelements(a);
float* vec_a = (float*)a->data;
float* vec_b = (float*)b->data;
for (int i = 0; i < nelements; i++) {
vec_a[i] = vec_a[i] + vec_b[i];
}
}
void ggml_tensor_scale(struct ggml_tensor* src, float scale) {
int64_t nelements = ggml_nelements(src);
float* data = (float*)src->data;
for (int i = 0; i < nelements; i++) {
data[i] = data[i] * scale;
}
}
void ggml_tensor_clamp(struct ggml_tensor* src, float min, float max) {
int64_t nelements = ggml_nelements(src);
float* data = (float*)src->data;
for (int i = 0; i < nelements; i++) {
float val = data[i];
data[i] = val < min ? min : (val > max ? max : val);
}
}
// convert values from [0, 1] to [-1, 1]
void ggml_tensor_scale_input(struct ggml_tensor* src) {
int64_t nelements = ggml_nelements(src);
float* data = (float*)src->data;
for (int i = 0; i < nelements; i++) {
float val = data[i];
data[i] = val * 2.0f - 1.0f;
}
}
// convert values from [-1, 1] to [0, 1]
void ggml_tensor_scale_output(struct ggml_tensor* src) {
int64_t nelements = ggml_nelements(src);
float* data = (float*)src->data;
for (int i = 0; i < nelements; i++) {
float val = data[i];
data[i] = (val + 1.0f) * 0.5f;
}
}
typedef std::function<void(ggml_tensor*, ggml_tensor*, bool)> on_tile_process;
// Tiling
void sd_tiling(ggml_tensor* input, ggml_tensor* output, const int scale, const int tile_size, const float tile_overlap_factor, on_tile_process on_processing) {
int input_width = input->ne[0];
int input_height = input->ne[1];
int output_width = output->ne[0];
int output_height = output->ne[1];
GGML_ASSERT(input_width % 2 == 0 && input_height % 2 == 0 && output_width % 2 == 0 && output_height % 2 == 0); // should be multiple of 2
int tile_overlap = (int32_t)(tile_size * tile_overlap_factor);
int non_tile_overlap = tile_size - tile_overlap;
struct ggml_init_params params = {};
params.mem_size += tile_size * tile_size * input->ne[2] * sizeof(float); // input chunk
params.mem_size += (tile_size * scale) * (tile_size * scale) * output->ne[2] * sizeof(float); // output chunk
params.mem_size += 3 * ggml_tensor_overhead();
params.mem_buffer = NULL;
params.no_alloc = false;
LOG_DEBUG("tile work buffer size: %.2f MB", params.mem_size / 1024.f / 1024.f);
// draft context
struct ggml_context* tiles_ctx = ggml_init(params);
if (!tiles_ctx) {
LOG_ERROR("ggml_init() failed");
return;
}
// tiling
ggml_tensor* input_tile = ggml_new_tensor_4d(tiles_ctx, GGML_TYPE_F32, tile_size, tile_size, input->ne[2], 1);
ggml_tensor* output_tile = ggml_new_tensor_4d(tiles_ctx, GGML_TYPE_F32, tile_size * scale, tile_size * scale, output->ne[2], 1);
on_processing(input_tile, NULL, true);
int num_tiles = (input_width * input_height) / (non_tile_overlap * non_tile_overlap);
LOG_INFO("processing %i tiles", num_tiles);
pretty_progress(1, num_tiles, 0.0f);
int tile_count = 1;
bool last_y = false, last_x = false;
float last_time = 0.0f;
for (int y = 0; y < input_height && !last_y; y += non_tile_overlap) {
if (y + tile_size >= input_height) {
y = input_height - tile_size;
last_y = true;
}
for (int x = 0; x < input_width && !last_x; x += non_tile_overlap) {
if (x + tile_size >= input_width) {
x = input_width - tile_size;
last_x = true;
}
int64_t t1 = ggml_time_ms();
ggml_split_tensor_2d(input, input_tile, x, y);
on_processing(input_tile, output_tile, false);
ggml_merge_tensor_2d(output_tile, output, x * scale, y * scale, tile_overlap * scale);
int64_t t2 = ggml_time_ms();
last_time = (t2 - t1) / 1000.0f;
pretty_progress(tile_count, num_tiles, last_time);
tile_count++;
}
last_x = false;
}
if (tile_count < num_tiles) {
pretty_progress(num_tiles, num_tiles, last_time);
}
}
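// Usage sketch (illustrative only, added for documentation): run sd_tiling() with
// scale = 1 and a trivial tile processor that copies each input tile to the output
// tile; with scale = 1, input and output must share the same spatial size. The real
// callers wrap a model forward pass (e.g. VAE encode/decode) in the lambda; the
// boolean argument distinguishes the one-time init call from per-tile calls.
static void sd_tiling_example(ggml_tensor* input, ggml_tensor* output) {
    on_tile_process copy_tile = [](ggml_tensor* in_tile, ggml_tensor* out_tile, bool init) {
        if (init) {
            return;  // first call only announces the tile tensors
        }
        copy_ggml_tensor(out_tile, in_tile);
    };
    sd_tiling(input, output, 1 /* scale */, 32 /* tile_size */, 0.5f /* overlap */, copy_tile);
}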
struct ggml_tensor* ggml_group_norm_32(struct ggml_context* ctx,
struct ggml_tensor* a) {
return ggml_group_norm(ctx, a, 32);
}
struct ggml_tensor* ggml_nn_linear(struct ggml_context* ctx,
struct ggml_tensor* x,
struct ggml_tensor* w,
struct ggml_tensor* b) {
x = ggml_mul_mat(ctx, w, x);
x = ggml_add(ctx, x, b);
return x;
}
// w: [OC, IC, KH, KW]
// x: [N, IC, IH, IW]
// b: [OC,]
// result: [N, OC, OH, OW]
struct ggml_tensor* ggml_nn_conv_2d(struct ggml_context* ctx,
struct ggml_tensor* x,
struct ggml_tensor* w,
struct ggml_tensor* b,
int s0 = 1,
int s1 = 1,
int p0 = 0,
int p1 = 0,
int d0 = 1,
int d1 = 1) {
x = ggml_conv_2d(ctx, w, x, s0, s1, p0, p1, d0, d1);
if (b != NULL) {
b = ggml_reshape_4d(ctx, b, 1, 1, b->ne[0], 1);
x = ggml_add(ctx, x, b);
}
return x;
}
struct ggml_tensor* ggml_nn_layer_norm(struct ggml_context* ctx,
struct ggml_tensor* x,
struct ggml_tensor* w,
struct ggml_tensor* b,
float eps = EPS) {
x = ggml_norm(ctx, x, eps);
x = ggml_mul(ctx, x, w);
x = ggml_add(ctx, x, b);
return x;
}
struct ggml_tensor* ggml_nn_group_norm(struct ggml_context* ctx,
struct ggml_tensor* x,
struct ggml_tensor* w,
struct ggml_tensor* b,
int num_groups = 32) {
if (x->n_dims == 4) {
w = ggml_reshape_4d(ctx, w, 1, 1, w->ne[0], 1);
b = ggml_reshape_4d(ctx, b, 1, 1, b->ne[0], 1);
}
x = ggml_group_norm(ctx, x, num_groups);
x = ggml_mul(ctx, x, w);
x = ggml_add(ctx, x, b);
return x;
}
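// Usage sketch (illustrative only, added for documentation): the ggml_nn_* helpers
// above mirror torch.nn.Linear / LayerNorm / GroupNorm by combining primitive ggml
// ops; they only add nodes to a context, so the result below still needs a graph
// build + compute to hold values. Shapes follow ggml's [ne0, ne1, ...] order, and all
// parameter names here are hypothetical.
static struct ggml_tensor* mlp_block_sketch(struct ggml_context* ctx,
                                            struct ggml_tensor* x,     // [hidden, n_token, N]
                                            struct ggml_tensor* ln_w,  // [hidden]
                                            struct ggml_tensor* ln_b,  // [hidden]
                                            struct ggml_tensor* fc_w,  // [hidden, inner]
                                            struct ggml_tensor* fc_b)  // [inner]
{
    x = ggml_nn_layer_norm(ctx, x, ln_w, ln_b);  // LayerNorm, then scale and shift
    x = ggml_nn_linear(ctx, x, fc_w, fc_b);      // matrix-multiply by fc_w, add bias
    return ggml_gelu_inplace(ctx, x);            // GELU activation
}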
std::pair<std::unordered_map<std::string, float>, std::string> extract_and_remove_lora(std::string text) {
std::regex re("<lora:([^:]+):([^>]+)>");
std::smatch matches;
std::unordered_map<std::string, float> filename2multiplier;
while (std::regex_search(text, matches, re)) {
std::string filename = matches[1].str();
float multiplier = std::stof(matches[2].str());
text = std::regex_replace(text, re, "", std::regex_constants::format_first_only);
if (multiplier == 0.f) {
continue;
}
if (filename2multiplier.find(filename) == filename2multiplier.end()) {
filename2multiplier[filename] = multiplier;
} else {
filename2multiplier[filename] += multiplier;
}
}
return std::make_pair(filename2multiplier, text);
}
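// Usage sketch (illustrative only, added for documentation): "<lora:name:multiplier>"
// tags are stripped from the prompt and accumulated into a filename -> multiplier map,
// so the same LoRA mentioned twice adds its multipliers together.
static void extract_and_remove_lora_example() {
    auto result = extract_and_remove_lora("a photo of a cat <lora:detail_slider:0.8>");
    for (const auto& kv : result.first) {
        printf("lora '%s' multiplier %.2f\n", kv.first.c_str(), kv.second);  // detail_slider 0.80
    }
    printf("cleaned prompt: '%s'\n", result.second.c_str());  // "a photo of a cat "
}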
void ggml_backend_tensor_get_and_sync(ggml_backend_t backend, const struct ggml_tensor * tensor, void * data, size_t offset, size_t size) {
#ifdef SD_USE_CUBLAS
ggml_backend_tensor_get_async(backend, tensor, data, offset, size);
ggml_backend_synchronize(backend);
#else
ggml_backend_tensor_get(tensor, data, offset, size);
#endif
}
/*================================================== CLIPTokenizer ===================================================*/
const std::string UNK_TOKEN = "<|endoftext|>";
const std::string BOS_TOKEN = "<|startoftext|>";
const std::string EOS_TOKEN = "<|endoftext|>";
const std::string PAD_TOKEN = "<|endoftext|>";
const int UNK_TOKEN_ID = 49407;
const int BOS_TOKEN_ID = 49406;
const int EOS_TOKEN_ID = 49407;
const int PAD_TOKEN_ID = 49407;
std::vector<std::pair<int, std::u32string>> bytes_to_unicode() {
std::vector<std::pair<int, std::u32string>> byte_unicode_pairs;
std::set<int> byte_set;
for (int b = static_cast<int>('!'); b <= static_cast<int>('~'); ++b) {
byte_set.insert(b);
byte_unicode_pairs.push_back(std::pair<int, std::u32string>(b, unicode_value_to_utf32(b)));
}
for (int b = 161; b <= 172; ++b) {
byte_set.insert(b);
byte_unicode_pairs.push_back(std::pair<int, std::u32string>(b, unicode_value_to_utf32(b)));
}
for (int b = 174; b <= 255; ++b) {
byte_set.insert(b);
byte_unicode_pairs.push_back(std::pair<int, std::u32string>(b, unicode_value_to_utf32(b)));
}
int n = 0;
for (int b = 0; b < 256; ++b) {
if (byte_set.find(b) == byte_set.end()) {
byte_unicode_pairs.push_back(std::pair<int, std::u32string>(b, unicode_value_to_utf32(n + 256)));
++n;
}
}
// LOG_DEBUG("byte_unicode_pairs %d", byte_unicode_pairs.size());
return byte_unicode_pairs;
}
// Ref: https://github.com/openai/CLIP/blob/main/clip/simple_tokenizer.py
class CLIPTokenizer {
private:
SDVersion version = VERSION_1_x;
std::map<int, std::u32string> byte_encoder;
std::map<std::u32string, int> encoder;
std::map<std::pair<std::u32string, std::u32string>, int> bpe_ranks;
std::regex pat;
static std::string strip(const std::string& str) {
std::string::size_type start = str.find_first_not_of(" \t\n\r\v\f");
std::string::size_type end = str.find_last_not_of(" \t\n\r\v\f");
if (start == std::string::npos) {
// String contains only whitespace characters
return "";
}
return str.substr(start, end - start + 1);
}
static std::string whitespace_clean(std::string text) {
text = std::regex_replace(text, std::regex(R"(\s+)"), " ");
text = strip(text);
return text;
}
static std::set<std::pair<std::u32string, std::u32string>> get_pairs(const std::vector<std::u32string>& subwords) {
std::set<std::pair<std::u32string, std::u32string>> pairs;
if (subwords.size() == 0) {
return pairs;
}
std::u32string prev_subword = subwords[0];
for (int i = 1; i < subwords.size(); i++) {
std::u32string subword = subwords[i];
std::pair<std::u32string, std::u32string> pair(prev_subword, subword);
pairs.insert(pair);
prev_subword = subword;
}
return pairs;
}
public:
CLIPTokenizer(SDVersion version = VERSION_1_x)
: version(version) {}
void load_from_merges(const std::string& merges_utf8_str) {
auto byte_unicode_pairs = bytes_to_unicode();
byte_encoder = std::map<int, std::u32string>(byte_unicode_pairs.begin(), byte_unicode_pairs.end());
// for (auto & pair: byte_unicode_pairs) {
// std::cout << pair.first << ": " << pair.second << std::endl;
// }
std::vector<std::u32string> merges;
size_t start = 0;
size_t pos;
std::u32string merges_utf32_str = utf8_to_utf32(merges_utf8_str);
while ((pos = merges_utf32_str.find('\n', start)) != std::string::npos) {
merges.push_back(merges_utf32_str.substr(start, pos - start));
start = pos + 1;
}
// LOG_DEBUG("merges size %llu", merges.size());
GGML_ASSERT(merges.size() == 48895);
merges = std::vector<std::u32string>(merges.begin() + 1, merges.end());
std::vector<std::pair<std::u32string, std::u32string>> merge_pairs;
for (const auto& merge : merges) {
size_t space_pos = merge.find(' ');
merge_pairs.emplace_back(merge.substr(0, space_pos), merge.substr(space_pos + 1));
// LOG_DEBUG("%s", utf32_to_utf8(merge.substr(space_pos + 1)).c_str());
}
std::vector<std::u32string> vocab;
for (const auto& pair : byte_unicode_pairs) {
vocab.push_back(pair.second);
}
for (const auto& pair : byte_unicode_pairs) {
vocab.push_back(pair.second + utf8_to_utf32("</w>"));
}
for (const auto& merge : merge_pairs) {
vocab.push_back(merge.first + merge.second);
}
vocab.push_back(utf8_to_utf32("<|startoftext|>"));
vocab.push_back(utf8_to_utf32("<|endoftext|>"));
LOG_DEBUG("vocab size: %llu", vocab.size());
int i = 0;
for (const auto& token : vocab) {
encoder[token] = i++;
}
int rank = 0;
for (const auto& merge : merge_pairs) {
bpe_ranks[merge] = rank++;
}
};
std::u32string bpe(const std::u32string& token) {
std::vector<std::u32string> word;
for (int i = 0; i < token.size() - 1; i++) {
word.emplace_back(1, token[i]);
}
word.push_back(token.substr(token.size() - 1) + utf8_to_utf32("</w>"));
std::set<std::pair<std::u32string, std::u32string>> pairs = get_pairs(word);
if (pairs.empty()) {
return token + utf8_to_utf32("</w>");
}
while (true) {
auto min_pair_iter = std::min_element(pairs.begin(),
pairs.end(),
[&](const std::pair<std::u32string, std::u32string>& a,
const std::pair<std::u32string, std::u32string>& b) {
if (bpe_ranks.find(a) == bpe_ranks.end()) {
return false;
} else if (bpe_ranks.find(b) == bpe_ranks.end()) {
return true;
}
return bpe_ranks.at(a) < bpe_ranks.at(b);
});
const std::pair<std::u32string, std::u32string>& bigram = *min_pair_iter;
if (bpe_ranks.find(bigram) == bpe_ranks.end()) {
break;
}
std::u32string first = bigram.first;
std::u32string second = bigram.second;
std::vector<std::u32string> new_word;
int32_t i = 0;
while (i < word.size()) {
auto it = std::find(word.begin() + i, word.end(), first);
if (it == word.end()) {
new_word.insert(new_word.end(), word.begin() + i, word.end());
break;
}
new_word.insert(new_word.end(), word.begin() + i, it);
i = static_cast<int32_t>(std::distance(word.begin(), it));
if (word[i] == first && i < static_cast<int32_t>(word.size()) - 1 && word[i + 1] == second) {
new_word.push_back(first + second);
i += 2;
} else {
new_word.push_back(word[i]);
i += 1;
}
}
word = new_word;
if (word.size() == 1) {
break;
}
pairs = get_pairs(word);
}
std::u32string result;
for (int i = 0; i < word.size(); i++) {
result += word[i];
if (i != word.size() - 1) {
result += utf8_to_utf32(" ");
}
}
return result;
}
std::vector<int> tokenize(std::string text, size_t max_length = 0, bool padding = false) {
std::vector<int32_t> tokens = encode(text);
tokens.insert(tokens.begin(), BOS_TOKEN_ID);
if (max_length > 0) {
if (tokens.size() > max_length - 1) {
tokens.resize(max_length - 1);
tokens.push_back(EOS_TOKEN_ID);
} else {
tokens.push_back(EOS_TOKEN_ID);
if (padding) {
int pad_token_id = PAD_TOKEN_ID;
if (version == VERSION_2_x) {
pad_token_id = 0;
}
tokens.insert(tokens.end(), max_length - tokens.size(), pad_token_id);
}
}
}
return tokens;
}
std::vector<int> encode(std::string text) {
std::string original_text = text;
std::vector<int32_t> bpe_tokens;
text = whitespace_clean(text);
std::transform(text.begin(), text.end(), text.begin(), [](unsigned char c) { return std::tolower(c); });
std::regex pat(R"(<\|startoftext\|>|<\|endoftext\|>|'s|'t|'re|'ve|'m|'ll|'d|[[:alpha:]]+|[[:digit:]]|[^[:space:][:alpha:][:digit:]]+)",
std::regex::icase);
std::smatch matches;
std::string str = text;
std::vector<std::string> token_strs;
while (std::regex_search(str, matches, pat)) {
for (auto& token : matches) {
std::string token_str = token.str();
std::u32string utf32_token;
for (int i = 0; i < token_str.length(); i++) {
unsigned char b = token_str[i]; // treat bytes as unsigned so values >= 128 index byte_encoder correctly
utf32_token += byte_encoder[b];
}
auto bpe_strs = bpe(utf32_token);
size_t start = 0;
size_t pos;
while ((pos = bpe_strs.find(' ', start)) != std::u32string::npos) {
auto bpe_str = bpe_strs.substr(start, pos - start);
bpe_tokens.push_back(encoder[bpe_str]);
token_strs.push_back(utf32_to_utf8(bpe_str));
start = pos + 1;
}
auto bpe_str = bpe_strs.substr(start, bpe_strs.size() - start);
bpe_tokens.push_back(encoder[bpe_str]);
token_strs.push_back(utf32_to_utf8(bpe_str));
}
str = matches.suffix();
}
std::stringstream ss;
ss << "[";
for (auto token : token_strs) {
ss << "\"" << token << "\", ";
}
ss << "]";
LOG_DEBUG("split prompt \"%s\" to tokens %s", original_text.c_str(), ss.str().c_str());
return bpe_tokens;
}
};
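// Usage sketch (illustrative only, added for documentation): the tokenizer needs the
// CLIP BPE merges text (loaded elsewhere by the model loader) before it can encode
// anything; load_from_merges() asserts the expected merge count, so real merges data
// is required.
static std::vector<int> clip_tokenize_example(const std::string& merges_utf8_str) {
    CLIPTokenizer tokenizer(VERSION_1_x);
    tokenizer.load_from_merges(merges_utf8_str);
    // 77 tokens max, padded with the EOS/PAD id: <BOS> ... <EOS> <PAD> ...
    return tokenizer.tokenize("a photo of a cat", 77, true);
}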
// Ref: https://github.com/AUTOMATIC1111/stable-diffusion-webui/blob/cad87bf4e3e0b0a759afa94e933527c3123d59bc/modules/prompt_parser.py#L345
//
// Parses a string with attention tokens and returns a list of pairs: text and its associated weight.
// Accepted tokens are:
// (abc) - increases attention to abc by a multiplier of 1.1
// (abc:3.12) - increases attention to abc by a multiplier of 3.12
// [abc] - decreases attention to abc by a multiplier of 1.1
// \( - literal character '('
// \[ - literal character '['
// \) - literal character ')'
// \] - literal character ']'
// \\ - literal character '\'
// anything else - just text
//
// >>> parse_prompt_attention('normal text')
// [['normal text', 1.0]]
// >>> parse_prompt_attention('an (important) word')
// [['an ', 1.0], ['important', 1.1], [' word', 1.0]]
// >>> parse_prompt_attention('(unbalanced')
// [['unbalanced', 1.1]]
// >>> parse_prompt_attention('\(literal\]')
// [['(literal]', 1.0]]
// >>> parse_prompt_attention('(unnecessary)(parens)')
// [['unnecessaryparens', 1.1]]
// >>> parse_prompt_attention('a (((house:1.3)) [on] a (hill:0.5), sun, (((sky))).')
// [['a ', 1.0],
// ['house', 1.5730000000000004],
// [' ', 1.1],
// ['on', 1.0],
// [' a ', 1.1],
// ['hill', 0.55],
// [', sun, ', 1.1],
// ['sky', 1.4641000000000006],
// ['.', 1.1]]
std::vector<std::pair<std::string, float>> parse_prompt_attention(const std::string& text) {
std::vector<std::pair<std::string, float>> res;
std::vector<int> round_brackets;
std::vector<int> square_brackets;
float round_bracket_multiplier = 1.1f;
float square_bracket_multiplier = 1 / 1.1f;
std::regex re_attention(R"(\\\(|\\\)|\\\[|\\\]|\\\\|\\|\(|\[|:([+-]?[.\d]+)\)|\)|\]|[^\\()\[\]:]+|:)");
std::regex re_break(R"(\s*\bBREAK\b\s*)");
auto multiply_range = [&](int start_position, float multiplier) {
for (int p = start_position; p < res.size(); ++p) {
res[p].second *= multiplier;
}
};
std::smatch m;
std::string remaining_text = text;
while (std::regex_search(remaining_text, m, re_attention)) {
std::string text = m[0];
std::string weight = m[1];
if (text == "(") {
round_brackets.push_back((int)res.size());
} else if (text == "[") {
square_brackets.push_back((int)res.size());
} else if (!weight.empty()) {
if (!round_brackets.empty()) {
multiply_range(round_brackets.back(), std::stof(weight));
round_brackets.pop_back();
}
} else if (text == ")" && !round_brackets.empty()) {
multiply_range(round_brackets.back(), round_bracket_multiplier);
round_brackets.pop_back();
} else if (text == "]" && !square_brackets.empty()) {
multiply_range(square_brackets.back(), square_bracket_multiplier);
square_brackets.pop_back();
} else if (text == "\\(") {
res.push_back({text.substr(1), 1.0f});
} else {
res.push_back({text, 1.0f});
}
remaining_text = m.suffix();
}
for (int pos : round_brackets) {
multiply_range(pos, round_bracket_multiplier);
}
for (int pos : square_brackets) {
multiply_range(pos, square_bracket_multiplier);
}
if (res.empty()) {
res.push_back({"", 1.0f});
}
int i = 0;
while (i + 1 < res.size()) {
if (res[i].second == res[i + 1].second) {
res[i].first += res[i + 1].first;
res.erase(res.begin() + i + 1);
} else {
++i;
}
}
return res;
}
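// Usage sketch (illustrative only, added for documentation), matching the doctest above:
// "an (important) word" -> [["an ", 1.0], ["important", 1.1], [" word", 1.0]].
static void parse_prompt_attention_example() {
    auto parsed = parse_prompt_attention("an (important) word");
    for (const auto& item : parsed) {
        printf("'%s' -> %.2f\n", item.first.c_str(), item.second);
    }
}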
/*================================================ FrozenCLIPEmbedder ================================================*/
struct ResidualAttentionBlock {
int32_t n_head;
int32_t d_model;
int32_t hidden_size; // n_head * d_model
int32_t intermediate_size;
// attention
struct ggml_tensor* q_w; // [hidden_size, hidden_size]
struct ggml_tensor* q_b; // [hidden_size, ]
struct ggml_tensor* k_w; // [hidden_size, hidden_size]
struct ggml_tensor* k_b; // [hidden_size, ]
struct ggml_tensor* v_w; // [hidden_size, hidden_size]
struct ggml_tensor* v_b; // [hidden_size, ]
struct ggml_tensor* out_w; // [hidden_size, hidden_size]
struct ggml_tensor* out_b; // [hidden_size, ]
// layer norm 1
struct ggml_tensor* ln1_w; // [hidden_size, ]
struct ggml_tensor* ln1_b; // [hidden_size, ]
// mlp
struct ggml_tensor* fc1_w; // [intermediate_size, hidden_size]
struct ggml_tensor* fc1_b; // [intermediate_size, ]
struct ggml_tensor* fc2_w; // [hidden_size, intermediate_size]
struct ggml_tensor* fc2_b; // [hidden_size, ]
// layer norm 2
struct ggml_tensor* ln2_w; // [hidden_size, ]
struct ggml_tensor* ln2_b; // [hidden_size, ]
struct ggml_tensor* attn_scale; // [1, ]
size_t calculate_mem_size(ggml_type wtype) {
double mem_size = 0;
mem_size += 4 * hidden_size * hidden_size * ggml_type_sizef(wtype); // q_w/k_w/v_w/out_w
mem_size += 8 * hidden_size * ggml_type_sizef(GGML_TYPE_F32); // q_b/k_b/v_b/out_b/ln1_w/ln1_b/ln2_w/ln2_b
mem_size += 2 * hidden_size * intermediate_size * ggml_type_sizef(wtype); // fc1_w/fc2_w
mem_size += intermediate_size * ggml_type_sizef(GGML_TYPE_F32); // fc1_b
mem_size += hidden_size * ggml_type_sizef(GGML_TYPE_F32); // fc2_b
mem_size += ggml_type_sizef(GGML_TYPE_F32); // attn_scale
return static_cast<size_t>(mem_size);
}
void init_params(struct ggml_context* ctx, ggml_allocr* alloc, ggml_type wtype) {
ln1_w = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, hidden_size);
ln1_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, hidden_size);
q_w = ggml_new_tensor_2d(ctx, wtype, hidden_size, hidden_size);
q_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, hidden_size);
k_w = ggml_new_tensor_2d(ctx, wtype, hidden_size, hidden_size);
k_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, hidden_size);
v_w = ggml_new_tensor_2d(ctx, wtype, hidden_size, hidden_size);
v_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, hidden_size);
out_w = ggml_new_tensor_2d(ctx, wtype, hidden_size, hidden_size);
out_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, hidden_size);
fc1_w = ggml_new_tensor_2d(ctx, wtype, hidden_size, intermediate_size);
fc1_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, intermediate_size);
fc2_w = ggml_new_tensor_2d(ctx, wtype, intermediate_size, hidden_size);
fc2_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, hidden_size);
ln2_w = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, hidden_size);
ln2_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, hidden_size);
attn_scale = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 1);
ggml_allocr_alloc(alloc, attn_scale);
float scale = 1.0f / sqrt((float)d_model);
ggml_backend_tensor_set(attn_scale, &scale, 0, sizeof(scale));
}
void map_by_name(std::map<std::string, struct ggml_tensor*>& tensors, const std::string prefix) {
tensors[prefix + "self_attn.q_proj.weight"] = q_w;
tensors[prefix + "self_attn.q_proj.bias"] = q_b;
tensors[prefix + "self_attn.k_proj.weight"] = k_w;
tensors[prefix + "self_attn.k_proj.bias"] = k_b;
tensors[prefix + "self_attn.v_proj.weight"] = v_w;
tensors[prefix + "self_attn.v_proj.bias"] = v_b;
tensors[prefix + "self_attn.out_proj.weight"] = out_w;
tensors[prefix + "self_attn.out_proj.bias"] = out_b;
tensors[prefix + "layer_norm1.weight"] = ln1_w;
tensors[prefix + "layer_norm1.bias"] = ln1_b;
tensors[prefix + "layer_norm2.weight"] = ln2_w;
tensors[prefix + "layer_norm2.bias"] = ln2_b;
tensors[prefix + "mlp.fc1.weight"] = fc1_w;
tensors[prefix + "mlp.fc1.bias"] = fc1_b;
tensors[prefix + "mlp.fc2.weight"] = fc2_w;
tensors[prefix + "mlp.fc2.bias"] = fc2_b;
}
struct ggml_tensor* forward(struct ggml_context* ctx, struct ggml_tensor* x) {
// x: [N, n_token, hidden_size]
int64_t N = x->ne[2];
int64_t n_token = x->ne[1];
int64_t hidden_size = n_head * d_model;
struct ggml_tensor* r = x;
// layer norm 1
x = ggml_nn_layer_norm(ctx, x, ln1_w, ln1_b);
// self-attention
{
struct ggml_tensor* q = ggml_nn_linear(ctx, x, q_w, q_b);
q = ggml_scale_inplace(ctx, q, attn_scale);
q = ggml_reshape_4d(ctx, q, d_model, n_head, n_token, N); // [N, n_token, n_head, d_model]
q = ggml_cont(ctx, ggml_permute(ctx, q, 0, 2, 1, 3)); // [N, n_head, n_token, d_model]
q = ggml_reshape_3d(ctx, q, d_model, n_token, n_head * N); // [N * n_head, n_token, d_model]
struct ggml_tensor* k = ggml_nn_linear(ctx, x, k_w, k_b);
k = ggml_reshape_4d(ctx, k, d_model, n_head, n_token, N); // [N, n_token, n_head, d_model]
k = ggml_cont(ctx, ggml_permute(ctx, k, 0, 2, 1, 3)); // [N, n_head, n_token, d_model]
k = ggml_reshape_3d(ctx, k, d_model, n_token, n_head * N); // [N * n_head, n_token, d_model]
struct ggml_tensor* v = ggml_nn_linear(ctx, x, v_w, v_b);
v = ggml_reshape_4d(ctx, v, d_model, n_head, n_token, N); // [N, n_token, n_head, d_model]
v = ggml_cont(ctx, ggml_permute(ctx, v, 1, 2, 0, 3)); // [N, n_head, d_model, n_token]
v = ggml_reshape_3d(ctx, v, n_token, d_model, n_head * N); // [N * n_head, d_model, n_token]
struct ggml_tensor* kq = ggml_mul_mat(ctx, k, q); // [N * n_head, n_token, n_token]
kq = ggml_diag_mask_inf_inplace(ctx, kq, 0);
kq = ggml_soft_max_inplace(ctx, kq);
struct ggml_tensor* kqv = ggml_mul_mat(ctx, v, kq); // [N * n_head, n_token, d_model]
kqv = ggml_reshape_4d(ctx, kqv, d_model, n_token, n_head, N);
kqv = ggml_cont(ctx, ggml_permute(ctx, kqv, 0, 2, 1, 3)); // [N, n_token, n_head, d_model]
x = ggml_reshape_2d(ctx, kqv, d_model * n_head, n_token * N); // [N * n_token, d_model * n_head]
}
// attention output
x = ggml_nn_linear(ctx, x, out_w, out_b);
// residual
x = ggml_add(ctx, x, r);
r = x;
// layer norm 2
x = ggml_nn_layer_norm(ctx, x, ln2_w, ln2_b);
// mlp
x = ggml_nn_linear(ctx, x, fc1_w, fc1_b);
if (hidden_size == 1024 || hidden_size == 1280) { // SD 2.x
x = ggml_gelu_inplace(ctx, x);
} else { // SD 1.x
x = ggml_gelu_quick_inplace(ctx, x);
}
x = ggml_nn_linear(ctx, x, fc2_w, fc2_b);
// residual 2
x = ggml_add(ctx, x, r);
return x;
}
};
// OPENAI_CLIP_VIT_L_14: https://huggingface.co/openai/clip-vit-large-patch14/blob/main/config.json
// OPEN_CLIP_VIT_H_14: https://huggingface.co/laion/CLIP-ViT-H-14-laion2B-s32B-b79K/blob/main/config.json
// OPEN_CLIP_VIT_BIGG_14: https://huggingface.co/laion/CLIP-ViT-bigG-14-laion2B-39B-b160k/blob/main/config.json (CLIPTextModelWithProjection)
// SDXL CLIPModel
// CLIPTextModelWithProjection seems optional
enum CLIPVersion {
OPENAI_CLIP_VIT_L_14, // SD 1.x and SDXL
OPEN_CLIP_VIT_H_14, // SD 2.x
OPEN_CLIP_VIT_BIGG_14, // SDXL
};
struct CLIPTextModel {
CLIPVersion version = OPENAI_CLIP_VIT_L_14;
// network hparams
int32_t vocab_size = 49408;
int32_t max_position_embeddings = 77;
int32_t hidden_size = 768; // 1024 for OPEN_CLIP_VIT_H_14
int32_t intermediate_size = 3072; // 4096 for OPEN_CLIP_VIT_H_14
int32_t n_head = 12; // num_attention_heads, 16 for OPEN_CLIP_VIT_H_14
int32_t num_hidden_layers = 12; // 24 for OPEN_CLIP_VIT_H_14
int32_t layer_idx = 11;
int32_t projection_dim = 1280; // only for OPEN_CLIP_VIT_BIGG_14
bool with_final_ln = true;
// embeddings
struct ggml_tensor* position_ids;
struct ggml_tensor* token_embed_weight;
struct ggml_tensor* position_embed_weight;
// transformer
std::vector<ResidualAttentionBlock> resblocks;
struct ggml_tensor* final_ln_w;
struct ggml_tensor* final_ln_b;
struct ggml_tensor* text_projection;
CLIPTextModel(CLIPVersion version = OPENAI_CLIP_VIT_L_14,
int clip_skip = 1,
bool with_final_ln = true)
: version(version), with_final_ln(with_final_ln) {
if (version == OPEN_CLIP_VIT_H_14) {
hidden_size = 1024;
intermediate_size = 4096;
n_head = 16;
num_hidden_layers = 24;
} else if (version == OPEN_CLIP_VIT_BIGG_14) { // CLIPTextModelWithProjection
hidden_size = 1280;
intermediate_size = 5120;
n_head = 20;
num_hidden_layers = 32;
}
layer_idx = num_hidden_layers - clip_skip;
resblocks.resize(num_hidden_layers);
set_resblocks_hp_params();
}
void set_resblocks_hp_params() {
int d_model = hidden_size / n_head; // 64 (SDXL is 40 for CLIPTextModelWithProjection)
for (int i = 0; i < num_hidden_layers; i++) {
resblocks[i].d_model = d_model;
resblocks[i].n_head = n_head;
resblocks[i].hidden_size = hidden_size;
resblocks[i].intermediate_size = intermediate_size;
}
}
size_t calculate_mem_size(ggml_type wtype) {
double mem_size = 0;
mem_size += hidden_size * max_position_embeddings * ggml_type_sizef(GGML_TYPE_I32); // position_ids
mem_size += hidden_size * vocab_size * ggml_type_sizef(wtype); // token_embed_weight
mem_size += hidden_size * max_position_embeddings * ggml_type_sizef(wtype); // position_embed_weight
for (int i = 0; i < num_hidden_layers; i++) {
mem_size += resblocks[i].calculate_mem_size(wtype);
}
mem_size += 2 * hidden_size * ggml_type_sizef(GGML_TYPE_F32); // final_ln_w/b
if (version == OPEN_CLIP_VIT_BIGG_14) {
mem_size += hidden_size * projection_dim * ggml_type_sizef(GGML_TYPE_F32); // text_projection
}
return static_cast<size_t>(mem_size);
}
void map_by_name(std::map<std::string, struct ggml_tensor*>& tensors, const std::string prefix) {
tensors[prefix + "embeddings.token_embedding.weight"] = token_embed_weight;
tensors[prefix + "embeddings.position_embedding.weight"] = position_embed_weight;
tensors[prefix + "final_layer_norm.weight"] = final_ln_w;
tensors[prefix + "final_layer_norm.bias"] = final_ln_b;
for (int i = 0; i < num_hidden_layers; i++) {
std::string name = prefix + "encoder.layers." + std::to_string(i) + ".";
resblocks[i].map_by_name(tensors, name);
}
if (version == OPEN_CLIP_VIT_BIGG_14) {
tensors[prefix + "text_projection"] = text_projection;
}
}
struct ggml_tensor* forward(struct ggml_context* ctx0, struct ggml_tensor* input_ids, uint32_t max_token_idx = 0, bool return_pooled = false) {
// input_ids: [N, n_token]
GGML_ASSERT(input_ids->ne[0] <= position_ids->ne[0]);
// token_embedding + position_embedding
struct ggml_tensor* x;
x = ggml_add(ctx0,
ggml_get_rows(ctx0, token_embed_weight, input_ids),
ggml_get_rows(ctx0,
position_embed_weight,
ggml_view_1d(ctx0, position_ids, input_ids->ne[0], 0))); // [N, n_token, hidden_size]
// transformer
for (int i = 0; i < num_hidden_layers; i++) {
if (!return_pooled && i == layer_idx + 1) {
// LOG_DEBUG("layer %d", i);
break;
}
x = resblocks[i].forward(ctx0, x); // [N, n_token, hidden_size]
}
// final layer norm
if (return_pooled || with_final_ln) {
x = ggml_nn_layer_norm(ctx0, x, final_ln_w, final_ln_b);
}
if (return_pooled) {
// ggml_tensor* idx = ggml_argmax(ctx0, input_ids);
// ggml_tensor* pooled = ggml_get_rows(ctx0, x, idx);
// LOG_DEBUG("max_token_idx: %u %u", max_token_idx, x->nb[1]);
ggml_tensor* pooled = ggml_view_1d(ctx0, x, hidden_size, x->nb[1] * max_token_idx);
pooled = ggml_mul_mat(ctx0, ggml_cont(ctx0, ggml_transpose(ctx0, text_projection)), pooled);
return pooled;
}
return x; // [N, n_token, hidden_size]
}
void alloc_params(ggml_context* ctx, ggml_backend_t backend, ggml_type wtype, ggml_allocr* alloc) {
position_ids = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, max_position_embeddings);
token_embed_weight = ggml_new_tensor_2d(ctx, wtype, hidden_size, vocab_size);
position_embed_weight = ggml_new_tensor_2d(ctx, wtype, hidden_size, max_position_embeddings);
for (int i = 0; i < num_hidden_layers; i++) {
resblocks[i].init_params(ctx, alloc, wtype);
}
final_ln_w = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, hidden_size);
final_ln_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, hidden_size);
if (version == OPEN_CLIP_VIT_BIGG_14) {
text_projection = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, projection_dim, hidden_size);
}
// alloc all tensors linked to this context
for (struct ggml_tensor* t = ggml_get_first_tensor(ctx); t != NULL; t = ggml_get_next_tensor(ctx, t)) {
if (t->data == NULL) {
ggml_allocr_alloc(alloc, t);
}
}
if (ggml_backend_is_cpu(backend)) {
for (int i = 0; i < max_position_embeddings; i++) {
ggml_set_i32_1d(position_ids, i, i);
}
} else {
std::vector<int> pos_temp;
for (int i = 0; i < max_position_embeddings; i++) {
pos_temp.push_back(i);
}
ggml_backend_tensor_set(position_ids, pos_temp.data(), 0, ggml_nbytes(position_ids));
}
}
};
// ldm.modules.encoders.modules.FrozenCLIPEmbedder
struct FrozenCLIPEmbedder {
CLIPTokenizer tokenizer;
CLIPTextModel text_model;
struct ggml_tensor* forward(struct ggml_context* ctx, struct ggml_allocr* allocr, const std::string& prompt) {
std::vector<int32_t> tokens = tokenizer.tokenize(prompt, text_model.max_position_embeddings, true);
struct ggml_tensor* input_ids = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, tokens.size());
memcpy(input_ids->data, tokens.data(), tokens.size() * ggml_element_size(input_ids));
struct ggml_tensor* hidden_states = text_model.forward(ctx, input_ids);
return hidden_states;
}
};
// Ref: https://github.com/AUTOMATIC1111/stable-diffusion-webui/blob/cad87bf4e3e0b0a759afa94e933527c3123d59bc/modules/sd_hijack_clip.py#L283
struct FrozenCLIPEmbedderWithCustomWords {
SDVersion version = VERSION_1_x;
CLIPTokenizer tokenizer;
CLIPTextModel text_model;
CLIPTextModel text_model2;
// context and memory buffers
struct ggml_context* ctx = NULL;
ggml_backend_buffer_t params_buffer = NULL;
ggml_backend_buffer_t compute_buffer = NULL; // for compute
struct ggml_allocr* compute_alloc = NULL;
size_t compute_memory_buffer_size = -1;
size_t memory_buffer_size = 0;
ggml_type wtype;
ggml_backend_t backend = NULL;
ggml_tensor* hidden_state_output = NULL;
ggml_tensor* pooled_output = NULL;
FrozenCLIPEmbedderWithCustomWords(SDVersion version = VERSION_1_x, int clip_skip = -1)
: version(version), tokenizer(version) {
if (clip_skip <= 0) {
clip_skip = 1;
if (version == VERSION_2_x || version == VERSION_XL) {
clip_skip = 2;
}
}
if (version == VERSION_1_x) {
text_model = CLIPTextModel(OPENAI_CLIP_VIT_L_14, clip_skip);
} else if (version == VERSION_2_x) {
text_model = CLIPTextModel(OPEN_CLIP_VIT_H_14, clip_skip);
} else if (version == VERSION_XL) {
text_model = CLIPTextModel(OPENAI_CLIP_VIT_L_14, clip_skip, false);
text_model2 = CLIPTextModel(OPEN_CLIP_VIT_BIGG_14, clip_skip, false);
}
}
size_t calculate_mem_size() {
size_t mem_size = text_model.calculate_mem_size(wtype);
if (version == VERSION_XL) {
mem_size += text_model2.calculate_mem_size(wtype);
}
return mem_size;
}
void map_by_name(std::map<std::string, struct ggml_tensor*>& tensors, const std::string prefix) {
text_model.map_by_name(tensors, prefix + "transformer.text_model.");
if (version == VERSION_XL) {
text_model2.map_by_name(tensors, prefix + "1.transformer.text_model.");
}
}
struct ggml_tensor* forward(struct ggml_context* ctx0, struct ggml_tensor* input_ids, struct ggml_tensor* input_ids2, uint32_t max_token_idx = 0, bool return_pooled = false) {
if (return_pooled) {
return text_model2.forward(ctx0, input_ids2, max_token_idx, return_pooled);
}
auto hidden_states = text_model.forward(ctx0, input_ids); // [N, n_token, hidden_size]
// LOG_DEBUG("hidden_states: %d %d %d %d %d", hidden_states->n_dims, hidden_states->ne[0], hidden_states->ne[1], hidden_states->ne[2], hidden_states->ne[3]);
if (version == VERSION_XL) {
hidden_states = ggml_reshape_4d(ctx0,
hidden_states,
hidden_states->ne[0],
hidden_states->ne[1],
hidden_states->ne[2],
hidden_states->ne[3]);
hidden_states = ggml_cont(ctx0, ggml_permute(ctx0, hidden_states, 2, 0, 1, 3));
auto hidden_states2 = text_model2.forward(ctx0, input_ids2); // [N, n_token, hidden_size2]
hidden_states2 = ggml_reshape_4d(ctx0,
hidden_states2,
hidden_states2->ne[0],
hidden_states2->ne[1],
hidden_states2->ne[2],
hidden_states2->ne[3]);
hidden_states2 = ggml_cont(ctx0, ggml_permute(ctx0, hidden_states2, 2, 0, 1, 3));
hidden_states = ggml_concat(ctx0, hidden_states, hidden_states2); // [N, n_token, hidden_size + hidden_size2]
hidden_states = ggml_cont(ctx0, ggml_permute(ctx0, hidden_states, 1, 2, 0, 3));
}
// LOG_DEBUG("hidden_states: %d %d %d %d", hidden_states->ne[0], hidden_states->ne[1], hidden_states->ne[2], hidden_states->ne[3]);
return hidden_states;
}
std::pair<std::vector<int>, std::vector<float>> tokenize(std::string text,
bool padding = false) {
return tokenize(text, text_model.max_position_embeddings, padding);
}
std::pair<std::vector<int>, std::vector<float>> tokenize(std::string text,
size_t max_length = 0,
bool padding = false) {
auto parsed_attention = parse_prompt_attention(text);
{
std::stringstream ss;
ss << "[";
for (const auto& item : parsed_attention) {
ss << "['" << item.first << "', " << item.second << "], ";
}
ss << "]";
LOG_DEBUG("parse '%s' to %s", text.c_str(), ss.str().c_str());
}
std::vector<int> tokens;
std::vector<float> weights;
for (const auto& item : parsed_attention) {
const std::string& curr_text = item.first;
float curr_weight = item.second;
std::vector<int> curr_tokens = tokenizer.encode(curr_text);
tokens.insert(tokens.end(), curr_tokens.begin(), curr_tokens.end());
weights.insert(weights.end(), curr_tokens.size(), curr_weight);
}
tokens.insert(tokens.begin(), BOS_TOKEN_ID);
weights.insert(weights.begin(), 1.0);
if (max_length > 0) {
if (tokens.size() > max_length - 1) {
tokens.resize(max_length - 1);
weights.resize(max_length - 1);
tokens.push_back(EOS_TOKEN_ID);
weights.push_back(1.0);
} else {
tokens.push_back(EOS_TOKEN_ID);
weights.push_back(1.0);
if (padding) {
int pad_token_id = PAD_TOKEN_ID;
if (version == VERSION_2_x) {
pad_token_id = 0;
}
tokens.insert(tokens.end(), max_length - tokens.size(), pad_token_id);
weights.insert(weights.end(), max_length - weights.size(), 1.0);
}
}
}
// for (int i = 0; i < tokens.size(); i++) {
// std::cout << tokens[i] << ":" << weights[i] << ", ";
// }
// std::cout << std::endl;
return {tokens, weights};
}
bool initialize(ggml_backend_t backend_, ggml_type wtype_) {
backend = backend_;
wtype = wtype_;
memory_buffer_size = 1 * 1024 * 1024; // 1 MB, for padding
memory_buffer_size += calculate_mem_size();
int num_tensors = (3 + 2 + 37 * text_model.num_hidden_layers);
if (version == VERSION_XL) {
num_tensors += (3 + 2 + 37 * text_model2.num_hidden_layers);
}
LOG_DEBUG("clip params backend buffer size = % 6.2f MB (%i tensors)", memory_buffer_size / (1024.0 * 1024.0), num_tensors);
struct ggml_init_params params;
params.mem_size = static_cast<size_t>(num_tensors * ggml_tensor_overhead());
params.mem_buffer = NULL;
params.no_alloc = true;
ctx = ggml_init(params);
if (!ctx) {
LOG_ERROR("ggml_init() failed");
return false;
}
params_buffer = ggml_backend_alloc_buffer(backend, memory_buffer_size);
return true;
}
void destroy() {
if (ctx != NULL) {
ggml_free(ctx);
ctx = NULL;
}
if (params_buffer != NULL) {
ggml_backend_buffer_free(params_buffer);
params_buffer = NULL;
}
}
void alloc_params() {
ggml_allocr* alloc = ggml_allocr_new_from_buffer(params_buffer);
text_model.alloc_params(ctx, backend, wtype, alloc);
if (version == VERSION_XL) {
text_model2.alloc_params(ctx, backend, wtype, alloc);
}
ggml_allocr_free(alloc);
}
struct ggml_cgraph* build_graph(struct ggml_allocr* allocr, std::vector<int> tokens, bool return_pooled = false) {
// since we are using ggml-alloc, this buffer only needs enough space to hold the ggml_tensor and ggml_cgraph structs, but not the tensor data
static size_t buf_size = ggml_tensor_overhead() * GGML_DEFAULT_GRAPH_SIZE + ggml_graph_overhead();
static std::vector<uint8_t> buf(buf_size);
struct ggml_init_params params = {
/*.mem_size =*/buf_size,
/*.mem_buffer =*/buf.data(),
/*.no_alloc =*/true, // the tensors will be allocated later by ggml_allocr_alloc_graph()
};
struct ggml_context* ctx0 = ggml_init(params);
struct ggml_cgraph* gf = ggml_new_graph(ctx0);
struct ggml_tensor* input_ids = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, tokens.size());
ggml_allocr_alloc(allocr, input_ids);
if (!ggml_allocr_is_measure(allocr)) {
ggml_backend_tensor_set(input_ids, tokens.data(), 0, tokens.size() * ggml_element_size(input_ids));
}
struct ggml_tensor* input_ids2 = NULL;
size_t max_token_idx = 0;
if (version == VERSION_XL) {
input_ids2 = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, tokens.size());
ggml_allocr_alloc(allocr, input_ids2);
auto it = std::find(tokens.begin(), tokens.end(), EOS_TOKEN_ID);
if (it != tokens.end()) {
std::fill(std::next(it), tokens.end(), 0);
}
max_token_idx = std::min<size_t>(std::distance(tokens.begin(), it), tokens.size() - 1);
// for (int i = 0; i < tokens.size(); i++) {
// printf("%d ", tokens[i]);
// }
// printf("\n");
if (!ggml_allocr_is_measure(allocr)) {
ggml_backend_tensor_set(input_ids2, tokens.data(), 0, tokens.size() * ggml_element_size(input_ids2));
}
}
struct ggml_tensor* hidden_states = forward(ctx0, input_ids, input_ids2, max_token_idx, return_pooled);
ggml_build_forward_expand(gf, hidden_states);
ggml_free(ctx0);
return gf;
}
void begin(ggml_context* work_ctx, int max_tokens) {
if (hidden_state_output == NULL) {
size_t total_hidden_size = text_model.hidden_size;
if (version == VERSION_XL) {
total_hidden_size += text_model2.hidden_size;
}
hidden_state_output = ggml_new_tensor_2d(work_ctx, GGML_TYPE_F32, total_hidden_size, text_model.max_position_embeddings);
pooled_output = ggml_new_tensor_1d(work_ctx, GGML_TYPE_F32, text_model2.projection_dim);
}
// calculate the amount of memory required
if (compute_memory_buffer_size == -1) {
compute_alloc = ggml_allocr_new_measure_from_backend(backend);
bool return_pooled = false;
if (version == VERSION_XL) {
return_pooled = true;
}
struct ggml_cgraph* gf = build_graph(compute_alloc, std::vector<int>(max_tokens), return_pooled);
// compute the required memory
compute_memory_buffer_size = ggml_allocr_alloc_graph(compute_alloc, gf) + 1024 * 1024;
// recreate the allocator with the required memory
ggml_allocr_free(compute_alloc);
LOG_DEBUG("learned condition compute buffer size: %.2f MB", compute_memory_buffer_size / 1024.0 / 1024.0);
}
compute_buffer = ggml_backend_alloc_buffer(backend, compute_memory_buffer_size);
compute_alloc = ggml_allocr_new_from_buffer(compute_buffer);
}
std::pair<struct ggml_tensor*, struct ggml_tensor*> compute(const int n_threads, std::vector<int> tokens) {
struct ggml_cgraph* gf = build_graph(compute_alloc, tokens);
ggml_allocr_alloc_graph(compute_alloc, gf);
if (ggml_backend_is_cpu(backend)) {
ggml_backend_cpu_set_n_threads(backend, n_threads);
}
#ifdef SD_USE_METAL
if (ggml_backend_is_metal(backend)) {
ggml_backend_metal_set_n_cb(backend, n_threads);
}
#endif
ggml_backend_graph_compute(backend, gf);
#ifdef GGML_PERF
ggml_graph_print(gf);
#endif
ggml_backend_tensor_get(gf->nodes[gf->n_nodes - 1], hidden_state_output->data, 0, ggml_nbytes(hidden_state_output));
if (version == VERSION_XL) {
struct ggml_cgraph* gf = build_graph(compute_alloc, tokens, true);
ggml_allocr_alloc_graph(compute_alloc, gf);
if (ggml_backend_is_cpu(backend)) {
ggml_backend_cpu_set_n_threads(backend, n_threads);
}
ggml_backend_graph_compute(backend, gf);
#ifdef GGML_PERF
ggml_graph_print(gf);
#endif
ggml_backend_tensor_get(gf->nodes[gf->n_nodes - 1], pooled_output->data, 0, ggml_nbytes(pooled_output));
return {hidden_state_output, pooled_output};
}
return {hidden_state_output, NULL};
}
void end() {
ggml_allocr_free(compute_alloc);
ggml_backend_buffer_free(compute_buffer);
compute_alloc = NULL;
compute_memory_buffer_size = -1;
hidden_state_output = NULL;
pooled_output = NULL;
}
};
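// Usage sketch (illustrative only, CPU backend, added for documentation): the
// conditioning lifecycle is initialize() -> alloc_params() -> (weights copied into
// params_buffer by the model loader) -> begin() -> tokenize()/compute() -> end().
// Without real weights the output is meaningless, but the call sequence below roughly
// mirrors how the txt2img path drives this struct; work_ctx and prompt are assumed to
// be provided by the caller, and work_ctx must have room for the output tensors.
static void clip_embedder_lifecycle_sketch(ggml_context* work_ctx, const std::string& prompt) {
    FrozenCLIPEmbedderWithCustomWords clip(VERSION_1_x);
    ggml_backend_t backend = ggml_backend_cpu_init();
    clip.initialize(backend, GGML_TYPE_F32);
    clip.alloc_params();  // weights would normally be loaded into params_buffer here
    clip.begin(work_ctx, clip.text_model.max_position_embeddings);
    auto tokens_and_weights = clip.tokenize(prompt, true);
    auto outputs = clip.compute(4 /* n_threads */, tokens_and_weights.first);  // {hidden states, pooled or NULL}
    print_ggml_tensor(outputs.first, true);  // shape only
    clip.end();
    clip.destroy();
    ggml_backend_free(backend);
}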
/*==================================================== UnetModel =====================================================*/
struct ResBlock {
// network hparams
int channels; // model_channels * (1, 1, 1, 2, 2, 4, 4, 4)
int emb_channels; // time_embed_dim
int out_channels; // mult * model_channels
// network params
// in_layers
struct ggml_tensor* in_layer_0_w; // [channels, ]
struct ggml_tensor* in_layer_0_b; // [channels, ]
// in_layer_1 is nn.SILU()
struct ggml_tensor* in_layer_2_w; // [out_channels, channels, 3, 3]
struct ggml_tensor* in_layer_2_b; // [out_channels, ]
// emb_layers
// emb_layer_0 is nn.SILU()
struct ggml_tensor* emb_layer_1_w; // [out_channels, emb_channels]
struct ggml_tensor* emb_layer_1_b; // [out_channels, ]
// out_layers
struct ggml_tensor* out_layer_0_w; // [out_channels, ]
struct ggml_tensor* out_layer_0_b; // [out_channels, ]
// out_layer_1 is nn.SILU()
// out_layer_2 is nn.Dropout(), p = 0 for inference
struct ggml_tensor* out_layer_3_w; // [out_channels, out_channels, 3, 3]
struct ggml_tensor* out_layer_3_b; // [out_channels, ]
// skip connection, only if out_channels != channels
struct ggml_tensor* skip_w; // [out_channels, channels, 1, 1]
struct ggml_tensor* skip_b; // [out_channels, ]
size_t calculate_mem_size(ggml_type wtype) {
double mem_size = 0;
mem_size += 2 * channels * ggml_type_sizef(GGML_TYPE_F32); // in_layer_0_w/b
mem_size += out_channels * channels * 3 * 3 * ggml_type_sizef(GGML_TYPE_F16); // in_layer_2_w
mem_size += 5 * out_channels * ggml_type_sizef(GGML_TYPE_F32); // in_layer_2_b/emb_layer_1_b/out_layer_0_w/out_layer_0_b/out_layer_3_b
mem_size += out_channels * emb_channels * ggml_type_sizef(wtype); // emb_layer_1_w
mem_size += out_channels * out_channels * 3 * 3 * ggml_type_sizef(GGML_TYPE_F16); // out_layer_3_w
if (out_channels != channels) {
mem_size += out_channels * channels * 1 * 1 * ggml_type_sizef(GGML_TYPE_F16); // skip_w
mem_size += out_channels * ggml_type_sizef(GGML_TYPE_F32); // skip_b
}
return static_cast<size_t>(mem_size);
}
void init_params(struct ggml_context* ctx, ggml_type wtype) {
in_layer_0_w = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, channels);
in_layer_0_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, channels);
in_layer_2_w = ggml_new_tensor_4d(ctx, GGML_TYPE_F16, 3, 3, channels, out_channels);
in_layer_2_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, out_channels);
emb_layer_1_w = ggml_new_tensor_2d(ctx, wtype, emb_channels, out_channels);
emb_layer_1_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, out_channels);
out_layer_0_w = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, out_channels);
out_layer_0_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, out_channels);
out_layer_3_w = ggml_new_tensor_4d(ctx, GGML_TYPE_F16, 3, 3, out_channels, out_channels);
out_layer_3_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, out_channels);
if (out_channels != channels) {
skip_w = ggml_new_tensor_4d(ctx, GGML_TYPE_F16, 1, 1, channels, out_channels);
skip_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, out_channels);
}
}
void map_by_name(std::map<std::string, struct ggml_tensor*>& tensors, const std::string prefix) {
tensors[prefix + "in_layers.0.weight"] = in_layer_0_w;
tensors[prefix + "in_layers.0.bias"] = in_layer_0_b;
tensors[prefix + "in_layers.2.weight"] = in_layer_2_w;
tensors[prefix + "in_layers.2.bias"] = in_layer_2_b;
tensors[prefix + "emb_layers.1.weight"] = emb_layer_1_w;
tensors[prefix + "emb_layers.1.bias"] = emb_layer_1_b;
tensors[prefix + "out_layers.0.weight"] = out_layer_0_w;
tensors[prefix + "out_layers.0.bias"] = out_layer_0_b;
tensors[prefix + "out_layers.3.weight"] = out_layer_3_w;
tensors[prefix + "out_layers.3.bias"] = out_layer_3_b;
if (out_channels != channels) {
tensors[prefix + "skip_connection.weight"] = skip_w;
tensors[prefix + "skip_connection.bias"] = skip_b;
}
}
struct ggml_tensor* forward(struct ggml_context* ctx, struct ggml_tensor* x, struct ggml_tensor* emb) {
// x: [N, channels, h, w]
// emb: [N, emb_channels]
// in_layers
auto h = ggml_nn_group_norm(ctx, x, in_layer_0_w, in_layer_0_b);
h = ggml_silu_inplace(ctx, h);
h = ggml_nn_conv_2d(ctx, h, in_layer_2_w, in_layer_2_b, 1, 1, 1, 1); // [N, out_channels, h, w]
// emb_layers
auto emb_out = ggml_silu(ctx, emb);
emb_out = ggml_nn_linear(ctx, emb_out, emb_layer_1_w, emb_layer_1_b); // [N, out_channels]
emb_out = ggml_reshape_4d(ctx, emb_out, 1, 1, emb_out->ne[0], emb_out->ne[1]); // [N, out_channels, 1, 1]
// out_layers
h = ggml_add(ctx, h, emb_out);
h = ggml_nn_group_norm(ctx, h, out_layer_0_w, out_layer_0_b);
h = ggml_silu_inplace(ctx, h);
// dropout, skip for inference
h = ggml_nn_conv_2d(ctx, h, out_layer_3_w, out_layer_3_b, 1, 1, 1, 1); // [N, out_channels, h, w]
// skip connection
if (out_channels != channels) {
x = ggml_nn_conv_2d(ctx, x, skip_w, skip_b); // [N, out_channels, h, w]
}
h = ggml_add(ctx, h, x);
return h; // [N, out_channels, h, w]
}
};
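// Summary of the block above: a ResBlock computes
//   h   = conv3x3(silu(group_norm(x)))
//   h  += linear(silu(emb)), broadcast over the spatial dims
//   h   = conv3x3(silu(group_norm(h)))         (dropout skipped at inference)
//   out = h + skip(x)                          (skip is a 1x1 conv only when out_channels != channels)
// Minimal usage sketch (illustrative only; `ctx`/`ctx0`, the hparams and the weight
// loading are assumed to be provided by the caller):
//   ResBlock block;
//   block.channels = 320; block.emb_channels = 1280; block.out_channels = 320;
//   block.init_params(ctx, GGML_TYPE_F16);     // create weight tensors in a params context
//   // ... load weights via map_by_name(), then while building a graph:
//   // struct ggml_tensor* y = block.forward(ctx0, x, emb);  // x: [N, 320, h, w], emb: [N, 1280]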
struct SpatialTransformer {
int in_channels; // mult * model_channels
int n_head; // num_heads
int d_head; // in_channels // n_heads
int depth = 1; // 1
int context_dim = 768; // hidden_size, 1024 for VERSION_2_x
// group norm
struct ggml_tensor* norm_w; // [in_channels,]
struct ggml_tensor* norm_b; // [in_channels,]
// proj_in
struct ggml_tensor* proj_in_w; // [in_channels, in_channels, 1, 1]
struct ggml_tensor* proj_in_b; // [in_channels,]
// transformer
struct Transformer {
// layer norm 1
struct ggml_tensor* norm1_w; // [in_channels, ]
struct ggml_tensor* norm1_b; // [in_channels, ]
// attn1
struct ggml_tensor* attn1_q_w; // [in_channels, in_channels]
struct ggml_tensor* attn1_k_w; // [in_channels, in_channels]
struct ggml_tensor* attn1_v_w; // [in_channels, in_channels]
struct ggml_tensor* attn1_out_w; // [in_channels, in_channels]
struct ggml_tensor* attn1_out_b; // [in_channels, ]
// layer norm 2
struct ggml_tensor* norm2_w; // [in_channels, ]
struct ggml_tensor* norm2_b; // [in_channels, ]
// attn2
struct ggml_tensor* attn2_q_w; // [in_channels, in_channels]
struct ggml_tensor* attn2_k_w; // [in_channels, context_dim]
struct ggml_tensor* attn2_v_w; // [in_channels, context_dim]
struct ggml_tensor* attn2_out_w; // [in_channels, in_channels]
struct ggml_tensor* attn2_out_b; // [in_channels, ]
// layer norm 3
struct ggml_tensor* norm3_w; // [in_channels, ]
struct ggml_tensor* norm3_b; // [in_channels, ]
// ff
struct ggml_tensor* ff_0_proj_w; // [in_channels * 4 * 2, in_channels]
struct ggml_tensor* ff_0_proj_b; // [in_channels * 4 * 2]
struct ggml_tensor* ff_2_w; // [in_channels, in_channels * 4]
struct ggml_tensor* ff_2_b; // [in_channels,]
};
std::vector<Transformer> transformers;
struct ggml_tensor* attn_scale;
// proj_out
struct ggml_tensor* proj_out_w; // [in_channels, in_channels, 1, 1]
struct ggml_tensor* proj_out_b; // [in_channels,]
SpatialTransformer(int depth = 1)
: depth(depth) {
transformers.resize(depth);
}
size_t get_num_tensors() {
return depth * 20 + 7;
}
size_t calculate_mem_size(ggml_type wtype) {
double mem_size = 0;
mem_size += 2 * in_channels * ggml_type_sizef(GGML_TYPE_F32); // norm_w/norm_b
mem_size += 2 * in_channels * in_channels * 1 * 1 * ggml_type_sizef(GGML_TYPE_F16); // proj_in_w/proj_out_w
mem_size += 2 * in_channels * ggml_type_sizef(GGML_TYPE_F32); // proj_in_b/proj_out_b
mem_size += 1 * ggml_type_sizef(GGML_TYPE_F32); // attn_scale
// transformer
for (auto& transformer : transformers) {
mem_size += 6 * in_channels * ggml_type_sizef(GGML_TYPE_F32); // norm1-3_w/b
mem_size += 6 * in_channels * in_channels * ggml_type_sizef(wtype); // attn1_q/k/v/out_w attn2_q/out_w
mem_size += 2 * in_channels * context_dim * ggml_type_sizef(wtype); // attn2_k/v_w
mem_size += in_channels * 4 * 2 * in_channels * ggml_type_sizef(wtype); // ff_0_proj_w
mem_size += in_channels * 4 * 2 * ggml_type_sizef(GGML_TYPE_F32); // ff_0_proj_b
mem_size += in_channels * 4 * in_channels * ggml_type_sizef(wtype); // ff_2_w
mem_size += in_channels * ggml_type_sizef(GGML_TYPE_F32); // ff_2_b
}
return static_cast<size_t>(mem_size);
}
void init_params(struct ggml_context* ctx, ggml_allocr* alloc, ggml_type wtype) {
norm_w = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, in_channels);
norm_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, in_channels);
proj_in_w = ggml_new_tensor_4d(ctx, GGML_TYPE_F16, 1, 1, in_channels, in_channels);
proj_in_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, in_channels);
proj_out_w = ggml_new_tensor_4d(ctx, GGML_TYPE_F16, 1, 1, in_channels, in_channels);
proj_out_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, in_channels);
attn_scale = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 1);
ggml_allocr_alloc(alloc, attn_scale);
float scale = 1.0f / sqrt((float)d_head);
ggml_backend_tensor_set(attn_scale, &scale, 0, sizeof(scale));
// transformer
for (auto& transformer : transformers) {
transformer.norm1_w = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, in_channels);
transformer.norm1_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, in_channels);
transformer.attn1_q_w = ggml_new_tensor_2d(ctx, wtype, in_channels, in_channels);
transformer.attn1_k_w = ggml_new_tensor_2d(ctx, wtype, in_channels, in_channels);
transformer.attn1_v_w = ggml_new_tensor_2d(ctx, wtype, in_channels, in_channels);
transformer.attn1_out_w = ggml_new_tensor_2d(ctx, wtype, in_channels, in_channels);
transformer.attn1_out_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, in_channels);
transformer.norm2_w = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, in_channels);
transformer.norm2_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, in_channels);
transformer.attn2_q_w = ggml_new_tensor_2d(ctx, wtype, in_channels, in_channels);
transformer.attn2_k_w = ggml_new_tensor_2d(ctx, wtype, context_dim, in_channels);
transformer.attn2_v_w = ggml_new_tensor_2d(ctx, wtype, context_dim, in_channels);
transformer.attn2_out_w = ggml_new_tensor_2d(ctx, wtype, in_channels, in_channels);
transformer.attn2_out_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, in_channels);
transformer.norm3_w = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, in_channels);
transformer.norm3_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, in_channels);
transformer.ff_0_proj_w = ggml_new_tensor_2d(ctx, wtype, in_channels, in_channels * 4 * 2);
transformer.ff_0_proj_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, in_channels * 4 * 2);
transformer.ff_2_w = ggml_new_tensor_2d(ctx, wtype, in_channels * 4, in_channels);
transformer.ff_2_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, in_channels);
}
}
void map_by_name(std::map<std::string, struct ggml_tensor*>& tensors, const std::string prefix) {
tensors[prefix + "norm.weight"] = norm_w;
tensors[prefix + "norm.bias"] = norm_b;
tensors[prefix + "proj_in.weight"] = proj_in_w;
tensors[prefix + "proj_in.bias"] = proj_in_b;
// transformer
for (int i = 0; i < transformers.size(); i++) {
auto& transformer = transformers[i];
std::string transformer_prefix = prefix + "transformer_blocks." + std::to_string(i) + ".";
tensors[transformer_prefix + "attn1.to_q.weight"] = transformer.attn1_q_w;
tensors[transformer_prefix + "attn1.to_k.weight"] = transformer.attn1_k_w;
tensors[transformer_prefix + "attn1.to_v.weight"] = transformer.attn1_v_w;
tensors[transformer_prefix + "attn1.to_out.0.weight"] = transformer.attn1_out_w;
tensors[transformer_prefix + "attn1.to_out.0.bias"] = transformer.attn1_out_b;
tensors[transformer_prefix + "ff.net.0.proj.weight"] = transformer.ff_0_proj_w;
tensors[transformer_prefix + "ff.net.0.proj.bias"] = transformer.ff_0_proj_b;
tensors[transformer_prefix + "ff.net.2.weight"] = transformer.ff_2_w;
tensors[transformer_prefix + "ff.net.2.bias"] = transformer.ff_2_b;
tensors[transformer_prefix + "attn2.to_q.weight"] = transformer.attn2_q_w;
tensors[transformer_prefix + "attn2.to_k.weight"] = transformer.attn2_k_w;
tensors[transformer_prefix + "attn2.to_v.weight"] = transformer.attn2_v_w;
tensors[transformer_prefix + "attn2.to_out.0.weight"] = transformer.attn2_out_w;
tensors[transformer_prefix + "attn2.to_out.0.bias"] = transformer.attn2_out_b;
tensors[transformer_prefix + "norm1.weight"] = transformer.norm1_w;
tensors[transformer_prefix + "norm1.bias"] = transformer.norm1_b;
tensors[transformer_prefix + "norm2.weight"] = transformer.norm2_w;
tensors[transformer_prefix + "norm2.bias"] = transformer.norm2_b;
tensors[transformer_prefix + "norm3.weight"] = transformer.norm3_w;
tensors[transformer_prefix + "norm3.bias"] = transformer.norm3_b;
}
tensors[prefix + "proj_out.weight"] = proj_out_w;
tensors[prefix + "proj_out.bias"] = proj_out_b;
}
struct ggml_tensor* forward(struct ggml_context* ctx, struct ggml_tensor* x, struct ggml_tensor* context) {
// x: [N, in_channels, h, w]
// context: [N, max_position, hidden_size(aka context_dim)]
auto x_in = x;
x = ggml_nn_group_norm(ctx, x, norm_w, norm_b);
// proj_in
x = ggml_nn_conv_2d(ctx, x, proj_in_w, proj_in_b); // [N, in_channels, h, w]
// transformer
const int64_t n = x->ne[3];
const int64_t c = x->ne[2];
const int64_t h = x->ne[1];
const int64_t w = x->ne[0];
const int64_t max_position = context->ne[1];
x = ggml_cont(ctx, ggml_permute(ctx, x, 1, 2, 0, 3)); // [N, h, w, in_channels]
for (auto& transformer : transformers) {
auto r = x;
// layer norm 1
x = ggml_reshape_2d(ctx, x, c, w * h * n);
x = ggml_nn_layer_norm(ctx, x, transformer.norm1_w, transformer.norm1_b);
// self-attention
{
x = ggml_reshape_2d(ctx, x, c, h * w * n); // [N * h * w, in_channels]
struct ggml_tensor* q = ggml_mul_mat(ctx, transformer.attn1_q_w, x); // [N * h * w, in_channels]
#if !defined(SD_USE_FLASH_ATTENTION) || defined(SD_USE_CUBLAS) || defined(SD_USE_METAL)
q = ggml_scale_inplace(ctx, q, attn_scale);
#endif
q = ggml_reshape_4d(ctx, q, d_head, n_head, h * w, n); // [N, h * w, n_head, d_head]
q = ggml_cont(ctx, ggml_permute(ctx, q, 0, 2, 1, 3)); // [N, n_head, h * w, d_head]
q = ggml_reshape_3d(ctx, q, d_head, h * w, n_head * n); // [N * n_head, h * w, d_head]
struct ggml_tensor* k = ggml_mul_mat(ctx, transformer.attn1_k_w, x); // [N * h * w, in_channels]
k = ggml_reshape_4d(ctx, k, d_head, n_head, h * w, n); // [N, h * w, n_head, d_head]
k = ggml_cont(ctx, ggml_permute(ctx, k, 0, 2, 1, 3)); // [N, n_head, h * w, d_head]
k = ggml_reshape_3d(ctx, k, d_head, h * w, n_head * n); // [N * n_head, h * w, d_head]
struct ggml_tensor* v = ggml_mul_mat(ctx, transformer.attn1_v_w, x); // [N * h * w, in_channels]
v = ggml_reshape_4d(ctx, v, d_head, n_head, h * w, n); // [N, h * w, n_head, d_head]
v = ggml_cont(ctx, ggml_permute(ctx, v, 1, 2, 0, 3)); // [N, n_head, d_head, h * w]
v = ggml_reshape_3d(ctx, v, h * w, d_head, n_head * n); // [N * n_head, d_head, h * w]
#if defined(SD_USE_FLASH_ATTENTION) && !defined(SD_USE_CUBLAS) && !defined(SD_USE_METAL)
struct ggml_tensor* kqv = ggml_flash_attn(ctx, q, k, v, false); // [N * n_head, h * w, d_head]
#else
struct ggml_tensor* kq = ggml_mul_mat(ctx, k, q); // [N * n_head, h * w, h * w]
// kq = ggml_diag_mask_inf_inplace(ctx, kq, 0);
kq = ggml_soft_max_inplace(ctx, kq);
struct ggml_tensor* kqv = ggml_mul_mat(ctx, v, kq); // [N * n_head, h * w, d_head]
#endif
kqv = ggml_reshape_4d(ctx, kqv, d_head, h * w, n_head, n);
kqv = ggml_cont(ctx, ggml_permute(ctx, kqv, 0, 2, 1, 3)); // [N, h * w, n_head, d_head]
// x = ggml_cpy(ctx, kqv, ggml_new_tensor_2d(ctx, GGML_TYPE_F32, d_head * n_head, h * w * n));
x = ggml_reshape_2d(ctx, kqv, d_head * n_head, h * w * n);
x = ggml_nn_linear(ctx, x, transformer.attn1_out_w, transformer.attn1_out_b);
x = ggml_reshape_4d(ctx, x, c, w, h, n);
}
x = ggml_add(ctx, x, r);
r = x;
// layer norm 2
x = ggml_nn_layer_norm(ctx, x, transformer.norm2_w, transformer.norm2_b);
// cross-attention
{
x = ggml_reshape_2d(ctx, x, c, h * w * n); // [N * h * w, in_channels]
context = ggml_reshape_2d(ctx, context, context->ne[0], context->ne[1] * context->ne[2]); // [N * max_position, hidden_size]
struct ggml_tensor* q = ggml_mul_mat(ctx, transformer.attn2_q_w, x); // [N * h * w, in_channels]
#if !defined(SD_USE_FLASH_ATTENTION) || defined(SD_USE_CUBLAS) || defined(SD_USE_METAL)
q = ggml_scale_inplace(ctx, q, attn_scale);
#endif
q = ggml_reshape_4d(ctx, q, d_head, n_head, h * w, n); // [N, h * w, n_head, d_head]
q = ggml_cont(ctx, ggml_permute(ctx, q, 0, 2, 1, 3)); // [N, n_head, h * w, d_head]
q = ggml_reshape_3d(ctx, q, d_head, h * w, n_head * n); // [N * n_head, h * w, d_head]
struct ggml_tensor* k = ggml_mul_mat(ctx, transformer.attn2_k_w, context); // [N * max_position, in_channels]
k = ggml_reshape_4d(ctx, k, d_head, n_head, max_position, n); // [N, max_position, n_head, d_head]
k = ggml_cont(ctx, ggml_permute(ctx, k, 0, 2, 1, 3)); // [N, n_head, max_position, d_head]
k = ggml_reshape_3d(ctx, k, d_head, max_position, n_head * n); // [N * n_head, max_position, d_head]
struct ggml_tensor* v = ggml_mul_mat(ctx, transformer.attn2_v_w, context); // [N * max_position, in_channels]
v = ggml_reshape_4d(ctx, v, d_head, n_head, max_position, n); // [N, max_position, n_head, d_head]
v = ggml_cont(ctx, ggml_permute(ctx, v, 1, 2, 0, 3)); // [N, n_head, d_head, max_position]
v = ggml_reshape_3d(ctx, v, max_position, d_head, n_head * n); // [N * n_head, d_head, max_position]
#if defined(SD_USE_FLASH_ATTENTION) && !defined(SD_USE_CUBLAS) && !defined(SD_USE_METAL)
struct ggml_tensor* kqv = ggml_flash_attn(ctx, q, k, v, false); // [N * n_head, h * w, d_head]
#else
struct ggml_tensor* kq = ggml_mul_mat(ctx, k, q); // [N * n_head, h * w, max_position]
// kq = ggml_diag_mask_inf_inplace(ctx, kq, 0);
kq = ggml_soft_max_inplace(ctx, kq);
struct ggml_tensor* kqv = ggml_mul_mat(ctx, v, kq); // [N * n_head, h * w, d_head]
#endif
kqv = ggml_reshape_4d(ctx, kqv, d_head, h * w, n_head, n);
kqv = ggml_cont(ctx, ggml_permute(ctx, kqv, 0, 2, 1, 3));
// x = ggml_cpy(ctx, kqv, ggml_new_tensor_2d(ctx, GGML_TYPE_F32, d_head * n_head, h * w * n)); // [N * h * w, in_channels]
x = ggml_reshape_2d(ctx, kqv, d_head * n_head, h * w * n); // [N * h * w, in_channels]
x = ggml_nn_linear(ctx, x, transformer.attn2_out_w, transformer.attn2_out_b);
x = ggml_reshape_4d(ctx, x, c, w, h, n);
}
x = ggml_add(ctx, x, r);
r = x;
// layer norm 3
x = ggml_reshape_2d(ctx, x, c, h * w * n); // [N * h * w, in_channels]
x = ggml_nn_layer_norm(ctx, x, transformer.norm3_w, transformer.norm3_b);
// ff
{
// GEGLU
auto x_w = ggml_view_2d(ctx,
transformer.ff_0_proj_w,
transformer.ff_0_proj_w->ne[0],
transformer.ff_0_proj_w->ne[1] / 2,
transformer.ff_0_proj_w->nb[1],
0); // [in_channels * 4, in_channels]
auto x_b = ggml_view_1d(ctx,
transformer.ff_0_proj_b,
transformer.ff_0_proj_b->ne[0] / 2,
0); // [in_channels * 4, ]
auto gate_w = ggml_view_2d(ctx,
transformer.ff_0_proj_w,
transformer.ff_0_proj_w->ne[0],
transformer.ff_0_proj_w->ne[1] / 2,
transformer.ff_0_proj_w->nb[1],
transformer.ff_0_proj_w->nb[1] * transformer.ff_0_proj_w->ne[1] / 2); // [in_channels * 4, in_channels]
auto gate_b = ggml_view_1d(ctx,
transformer.ff_0_proj_b,
transformer.ff_0_proj_b->ne[0] / 2,
transformer.ff_0_proj_b->nb[0] * transformer.ff_0_proj_b->ne[0] / 2); // [in_channels * 4, ]
x = ggml_reshape_2d(ctx, x, c, w * h * n);
auto x_in = x;
x = ggml_nn_linear(ctx, x_in, x_w, x_b); // [N * h * w, in_channels * 4]
auto gate = ggml_nn_linear(ctx, x_in, gate_w, gate_b); // [N * h * w, in_channels * 4]
gate = ggml_gelu_inplace(ctx, gate);
x = ggml_mul(ctx, x, gate); // [N * h * w, in_channels * 4]
// fc
x = ggml_nn_linear(ctx, x, transformer.ff_2_w, transformer.ff_2_b); // [N * h * w, in_channels]
}
x = ggml_reshape_4d(ctx, x, c, w, h, n); // [N, h, w, in_channels]
// residual
x = ggml_add(ctx, x, r);
}
x = ggml_cont(ctx, ggml_permute(ctx, x, 2, 0, 1, 3)); // [N, in_channels, h, w]
// proj_out
x = ggml_nn_conv_2d(ctx, x, proj_out_w, proj_out_b); // [N, in_channels, h, w]
x = ggml_add(ctx, x, x_in);
return x;
}
};
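// Summary of the block above: the SpatialTransformer wraps `depth` basic transformer
// layers between proj_in/proj_out 1x1 convs and a residual to the input. Each layer does
//   x = x + self_attn(layer_norm(x))                    // attn1, queries/keys/values from x
//   x = x + cross_attn(layer_norm(x), context)          // attn2, keys/values from context
//   x = x + ff(layer_norm(x))                           // GEGLU: (x W + b) * gelu(x V + c)
// The GEGLU weights are stored fused in ff_0_proj_w/b, which is why forward() slices
// them into two halves with ggml_view_2d/ggml_view_1d before the two linear projections.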
struct DownSample {
// hparams
int channels;
int out_channels;
// conv2d params
struct ggml_tensor* op_w; // [out_channels, channels, 3, 3]
struct ggml_tensor* op_b; // [out_channels,]
bool vae_downsample = false;
size_t calculate_mem_size(ggml_type wtype) {
double mem_size = 0;
mem_size += out_channels * channels * 3 * 3 * ggml_type_sizef(GGML_TYPE_F16); // op_w
mem_size += out_channels * ggml_type_sizef(GGML_TYPE_F32); // op_b
return static_cast<size_t>(mem_size);
}
void init_params(struct ggml_context* ctx, ggml_type wtype) {
op_w = ggml_new_tensor_4d(ctx, GGML_TYPE_F16, 3, 3, channels, out_channels);
op_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, out_channels);
}
void map_by_name(std::map<std::string, struct ggml_tensor*>& tensors, const std::string prefix) {
if (vae_downsample) {
tensors[prefix + "conv.weight"] = op_w;
tensors[prefix + "conv.bias"] = op_b;
} else {
tensors[prefix + "op.weight"] = op_w;
tensors[prefix + "op.bias"] = op_b;
}
}
struct ggml_tensor* forward(struct ggml_context* ctx, struct ggml_tensor* x) {
// x: [N, channels, h, w]
struct ggml_tensor* c = NULL;
if (vae_downsample) {
c = ggml_pad(ctx, x, 1, 1, 0, 0);
c = ggml_nn_conv_2d(ctx, c, op_w, op_b, 2, 2, 0, 0);
} else {
c = ggml_nn_conv_2d(ctx, x, op_w, op_b, 2, 2, 1, 1);
}
return c; // [N, out_channels, h/2, w/2]
}
};
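// Note: both branches halve the spatial resolution with a stride-2 3x3 conv, so the
// output is [N, out_channels, h/2, w/2]. The UNet path pads symmetrically (1, 1); the
// VAE path reproduces ldm's asymmetric padding by padding one extra row/column with
// ggml_pad before a conv with no padding.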
struct UpSample {
// hparams
int channels;
int out_channels;
// conv2d params
struct ggml_tensor* conv_w; // [out_channels, channels, 3, 3]
struct ggml_tensor* conv_b; // [out_channels,]
size_t calculate_mem_size(ggml_type wtype) {
double mem_size = 0;
mem_size += out_channels * channels * 3 * 3 * ggml_type_sizef(GGML_TYPE_F16); // op_w
mem_size += out_channels * ggml_type_sizef(GGML_TYPE_F32); // op_b
return static_cast<size_t>(mem_size);
}
void init_params(struct ggml_context* ctx, ggml_type wtype) {
conv_w = ggml_new_tensor_4d(ctx, GGML_TYPE_F16, 3, 3, channels, out_channels);
conv_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, out_channels);
}
void map_by_name(std::map<std::string, struct ggml_tensor*>& tensors, const std::string prefix) {
tensors[prefix + "conv.weight"] = conv_w;
tensors[prefix + "conv.bias"] = conv_b;
}
struct ggml_tensor* forward(struct ggml_context* ctx, struct ggml_tensor* x) {
// x: [N, channels, h, w]
x = ggml_upscale(ctx, x, 2); // [N, channels, h*2, w*2]
x = ggml_nn_conv_2d(ctx, x, conv_w, conv_b, 1, 1, 1, 1); // [N, out_channels, h*2, w*2]
return x;
}
};
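// Note: upsampling is nearest-neighbor x2 (ggml_upscale) followed by a padded 3x3
// conv, so [N, channels, h, w] becomes [N, out_channels, 2*h, 2*w].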
// ldm.modules.diffusionmodules.openaimodel.UNetModel
struct UNetModel {
SDVersion version = VERSION_1_x;
// network hparams
int in_channels = 4;
int model_channels = 320;
int out_channels = 4;
int num_res_blocks = 2;
std::vector<int> attention_resolutions = {4, 2, 1};
std::vector<int> channel_mult = {1, 2, 4, 4};
std::vector<int> transformer_depth = {1, 1, 1, 1};
int time_embed_dim = 1280; // model_channels*4
int num_heads = 8;
int num_head_channels = -1; // channels // num_heads
int context_dim = 768; // 1024 for VERSION_2_x, 2048 for VERSION_XL
int adm_in_channels = 2816; // only for VERSION_XL
// network params
struct ggml_tensor* time_embed_0_w; // [time_embed_dim, model_channels]
struct ggml_tensor* time_embed_0_b; // [time_embed_dim, ]
// time_embed_1 is nn.SILU()
struct ggml_tensor* time_embed_2_w; // [time_embed_dim, time_embed_dim]
struct ggml_tensor* time_embed_2_b; // [time_embed_dim, ]
struct ggml_tensor* label_embed_0_w; // [time_embed_dim, adm_in_channels]
struct ggml_tensor* label_embed_0_b; // [time_embed_dim, ]
// label_embed_1 is nn.SILU()
struct ggml_tensor* label_embed_2_w; // [time_embed_dim, time_embed_dim]
struct ggml_tensor* label_embed_2_b; // [time_embed_dim, ]
struct ggml_tensor* input_block_0_w; // [model_channels, in_channels, 3, 3]
struct ggml_tensor* input_block_0_b; // [model_channels, ]
// input_blocks
ResBlock input_res_blocks[4][2];
SpatialTransformer input_transformers[3][2];
DownSample input_down_samples[3];
// middle_block
ResBlock middle_block_0;
SpatialTransformer middle_block_1;
ResBlock middle_block_2;
// output_blocks
ResBlock output_res_blocks[4][3];
SpatialTransformer output_transformers[3][3];
UpSample output_up_samples[3];
// out
// group norm 32
struct ggml_tensor* out_0_w; // [model_channels, ]
struct ggml_tensor* out_0_b; // [model_channels, ]
// out 1 is nn.SILU()
struct ggml_tensor* out_2_w; // [out_channels, model_channels, 3, 3]
struct ggml_tensor* out_2_b; // [out_channels, ]
struct ggml_context* ctx;
ggml_backend_buffer_t params_buffer;
ggml_backend_buffer_t compute_buffer; // for compute
struct ggml_allocr* compute_alloc = NULL;
size_t compute_memory_buffer_size = -1;
size_t memory_buffer_size = 0;
ggml_type wtype;
ggml_backend_t backend = NULL;
UNetModel(SDVersion version = VERSION_1_x)
: version(version) {
if (version == VERSION_2_x) {
context_dim = 1024;
num_head_channels = 64;
num_heads = -1;
} else if (version == VERSION_XL) {
context_dim = 2048;
attention_resolutions = {4, 2};
channel_mult = {1, 2, 4};
transformer_depth = {1, 2, 10};
num_head_channels = 64;
num_heads = -1;
}
// set up hparams of blocks
// input_blocks
std::vector<int> input_block_chans;
input_block_chans.push_back(model_channels);
int ch = model_channels;
int ds = 1;
int len_mults = channel_mult.size();
for (int i = 0; i < len_mults; i++) {
int mult = channel_mult[i];
for (int j = 0; j < num_res_blocks; j++) {
input_res_blocks[i][j].channels = ch;
input_res_blocks[i][j].emb_channels = time_embed_dim;
input_res_blocks[i][j].out_channels = mult * model_channels;
ch = mult * model_channels;
if (std::find(attention_resolutions.begin(), attention_resolutions.end(), ds) != attention_resolutions.end()) {
int n_head = num_heads;
int d_head = ch / num_heads;
if (num_head_channels != -1) {
d_head = num_head_channels;
n_head = ch / d_head;
}
input_transformers[i][j] = SpatialTransformer(transformer_depth[i]);
input_transformers[i][j].in_channels = ch;
input_transformers[i][j].n_head = n_head;
input_transformers[i][j].d_head = d_head;
input_transformers[i][j].context_dim = context_dim;
}
input_block_chans.push_back(ch);
}
if (i != len_mults - 1) {
input_down_samples[i].channels = ch;
input_down_samples[i].out_channels = ch;
input_block_chans.push_back(ch);
ds *= 2;
}
}
// middle blocks
middle_block_0.channels = ch;
middle_block_0.emb_channels = time_embed_dim;
middle_block_0.out_channels = ch;
int n_head = num_heads;
int d_head = ch / num_heads;
if (num_head_channels != -1) {
d_head = num_head_channels;
n_head = ch / d_head;
}
middle_block_1 = SpatialTransformer(transformer_depth[transformer_depth.size() - 1]);
middle_block_1.in_channels = ch;
middle_block_1.n_head = n_head;
middle_block_1.d_head = d_head;
middle_block_1.context_dim = context_dim;
middle_block_2.channels = ch;
middle_block_2.emb_channels = time_embed_dim;
middle_block_2.out_channels = ch;
// output blocks
for (int i = len_mults - 1; i >= 0; i--) {
int mult = channel_mult[i];
for (int j = 0; j < num_res_blocks + 1; j++) {
int ich = input_block_chans.back();
input_block_chans.pop_back();
output_res_blocks[i][j].channels = ch + ich;
output_res_blocks[i][j].emb_channels = time_embed_dim;
output_res_blocks[i][j].out_channels = mult * model_channels;
ch = mult * model_channels;
if (std::find(attention_resolutions.begin(), attention_resolutions.end(), ds) != attention_resolutions.end()) {
int n_head = num_heads;
int d_head = ch / num_heads;
if (num_head_channels != -1) {
d_head = num_head_channels;
n_head = ch / d_head;
}
output_transformers[i][j] = SpatialTransformer(transformer_depth[i]);
output_transformers[i][j].in_channels = ch;
output_transformers[i][j].n_head = n_head;
output_transformers[i][j].d_head = d_head;
output_transformers[i][j].context_dim = context_dim;
}
if (i > 0 && j == num_res_blocks) {
output_up_samples[i - 1].channels = ch;
output_up_samples[i - 1].out_channels = ch;
ds /= 2;
}
}
}
}
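// Note on the bookkeeping above: for SD 1.x/2.x this reproduces the standard layout of
// 12 input blocks (the initial conv, two ResBlocks per level with attention at the
// resolutions listed in attention_resolutions, and a DownSample between levels), a
// middle block (ResBlock, SpatialTransformer, ResBlock) and 12 output blocks.
// input_block_chans records the channel count produced by every input block so the
// output blocks, popping the list in reverse, can size their skip-concatenated
// ResBlocks as ch + ich.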
size_t calculate_mem_size() {
double mem_size = 0;
mem_size += time_embed_dim * model_channels * ggml_type_sizef(wtype); // time_embed_0_w
mem_size += time_embed_dim * ggml_type_sizef(GGML_TYPE_F32); // time_embed_0_b
mem_size += time_embed_dim * time_embed_dim * ggml_type_sizef(wtype); // time_embed_2_w
mem_size += time_embed_dim * ggml_type_sizef(GGML_TYPE_F32); // time_embed_2_b
if (version == VERSION_XL) {
mem_size += time_embed_dim * adm_in_channels * ggml_type_sizef(wtype); // label_embed_0_w
mem_size += time_embed_dim * ggml_type_sizef(GGML_TYPE_F32); // label_embed_0_b
mem_size += time_embed_dim * time_embed_dim * ggml_type_sizef(wtype); // label_embed_2_w
mem_size += time_embed_dim * ggml_type_sizef(GGML_TYPE_F32); // label_embed_2_b
}
mem_size += model_channels * in_channels * 3 * 3 * ggml_type_sizef(GGML_TYPE_F16); // input_block_0_w
mem_size += model_channels * ggml_type_sizef(GGML_TYPE_F32); // input_block_0_b
// input_blocks
int ds = 1;
int len_mults = channel_mult.size();
for (int i = 0; i < len_mults; i++) {
for (int j = 0; j < num_res_blocks; j++) {
mem_size += input_res_blocks[i][j].calculate_mem_size(wtype);
if (std::find(attention_resolutions.begin(), attention_resolutions.end(), ds) != attention_resolutions.end()) {
mem_size += input_transformers[i][j].calculate_mem_size(wtype);
}
}
if (i != len_mults - 1) {
ds *= 2;
mem_size += input_down_samples[i].calculate_mem_size(wtype);
}
}
// middle_block
mem_size += middle_block_0.calculate_mem_size(wtype);
mem_size += middle_block_1.calculate_mem_size(wtype);
mem_size += middle_block_2.calculate_mem_size(wtype);
// output_blocks
for (int i = len_mults - 1; i >= 0; i--) {
for (int j = 0; j < num_res_blocks + 1; j++) {
mem_size += output_res_blocks[i][j].calculate_mem_size(wtype);
if (std::find(attention_resolutions.begin(), attention_resolutions.end(), ds) != attention_resolutions.end()) {
mem_size += output_transformers[i][j].calculate_mem_size(wtype);
}
if (i > 0 && j == num_res_blocks) {
mem_size += output_up_samples[i - 1].calculate_mem_size(wtype);
ds /= 2;
}
}
}
// out
mem_size += 2 * model_channels * ggml_type_sizef(GGML_TYPE_F32); // out_0_w/b
mem_size += out_channels * model_channels * 3 * 3 * ggml_type_sizef(GGML_TYPE_F16); // out_2_w
mem_size += out_channels * ggml_type_sizef(GGML_TYPE_F32); // out_2_b
return static_cast<size_t>(mem_size);
}
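// Note: the totals are accumulated in a double because ggml_type_sizef() returns the
// per-element size in bytes as a float (fractional for quantized types), and the sum
// is only cast to size_t at the end.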
int get_num_tensors() {
// in
int num_tensors = 6;
if (version == VERSION_XL) {
num_tensors += 4;
}
// input blocks
int ds = 1;
int len_mults = channel_mult.size();
for (int i = 0; i < len_mults; i++) {
for (int j = 0; j < num_res_blocks; j++) {
num_tensors += 12;
if (std::find(attention_resolutions.begin(), attention_resolutions.end(), ds) != attention_resolutions.end()) {
num_tensors += input_transformers[i][j].get_num_tensors();
}
}
if (i != len_mults - 1) {
ds *= 2;
num_tensors += 2;
}
}
// middle blocks
num_tensors += 13 * 2;
num_tensors += middle_block_1.get_num_tensors();
// output blocks
for (int i = len_mults - 1; i >= 0; i--) {
for (int j = 0; j < num_res_blocks + 1; j++) {
num_tensors += 12;
if (std::find(attention_resolutions.begin(), attention_resolutions.end(), ds) != attention_resolutions.end()) {
num_tensors += output_transformers[i][j].get_num_tensors();
}
if (i > 0 && j == num_res_blocks) {
num_tensors += 2;
ds /= 2;
}
}
}
// out
num_tensors += 4;
return num_tensors;
}
bool initialize(ggml_backend_t backend_, ggml_type wtype_) {
backend = backend_;
wtype = wtype_;
memory_buffer_size = 10 * 1024 * 1024; // 10 MB, for padding
memory_buffer_size += calculate_mem_size();
int num_tensors = get_num_tensors();
LOG_DEBUG("unet params backend buffer size = % 6.2f MB (%i tensors)", memory_buffer_size / (1024.0 * 1024.0), num_tensors);
struct ggml_init_params params;
params.mem_size = static_cast<size_t>(num_tensors * ggml_tensor_overhead()) + 1 * 1024 * 1024;
params.mem_buffer = NULL;
params.no_alloc = true;
// LOG_DEBUG("mem_size %u ", params.mem_size);
ctx = ggml_init(params);
if (!ctx) {
LOG_ERROR("ggml_init() failed");
return false;
}
params_buffer = ggml_backend_alloc_buffer(backend, memory_buffer_size);
return true;
}
void destroy() {
if (ctx != NULL) {
ggml_free(ctx);
ctx = NULL;
}
if (params_buffer != NULL) {
ggml_backend_buffer_free(params_buffer);
params_buffer = NULL;
}
}
void alloc_params() {
ggml_allocr* alloc = ggml_allocr_new_from_buffer(params_buffer);
time_embed_0_w = ggml_new_tensor_2d(ctx, wtype, model_channels, time_embed_dim);
time_embed_0_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, time_embed_dim);
time_embed_2_w = ggml_new_tensor_2d(ctx, wtype, time_embed_dim, time_embed_dim);
time_embed_2_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, time_embed_dim);
// SDXL
if (version == VERSION_XL) {
label_embed_0_w = ggml_new_tensor_2d(ctx, wtype, adm_in_channels, time_embed_dim);
label_embed_0_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, time_embed_dim);
label_embed_2_w = ggml_new_tensor_2d(ctx, wtype, time_embed_dim, time_embed_dim);
label_embed_2_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, time_embed_dim);
}
// input_blocks
input_block_0_w = ggml_new_tensor_4d(ctx, GGML_TYPE_F16, 3, 3, in_channels, model_channels);
input_block_0_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, model_channels);
int ds = 1;
int len_mults = channel_mult.size();
for (int i = 0; i < len_mults; i++) {
for (int j = 0; j < num_res_blocks; j++) {
input_res_blocks[i][j].init_params(ctx, wtype);
if (std::find(attention_resolutions.begin(), attention_resolutions.end(), ds) != attention_resolutions.end()) {
input_transformers[i][j].init_params(ctx, alloc, wtype);
}
}
if (i != len_mults - 1) {
input_down_samples[i].init_params(ctx, wtype);
ds *= 2;
}
}
// middle_blocks
middle_block_0.init_params(ctx, wtype);
middle_block_1.init_params(ctx, alloc, wtype);
middle_block_2.init_params(ctx, wtype);
// output_blocks
for (int i = len_mults - 1; i >= 0; i--) {
for (int j = 0; j < num_res_blocks + 1; j++) {
output_res_blocks[i][j].init_params(ctx, wtype);
if (std::find(attention_resolutions.begin(), attention_resolutions.end(), ds) != attention_resolutions.end()) {
output_transformers[i][j].init_params(ctx, alloc, wtype);
}
if (i > 0 && j == num_res_blocks) {
output_up_samples[i - 1].init_params(ctx, wtype);
ds /= 2;
}
}
}
// out
out_0_w = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, model_channels);
out_0_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, model_channels);
out_2_w = ggml_new_tensor_4d(ctx, GGML_TYPE_F16, 3, 3, model_channels, out_channels);
out_2_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, out_channels);
// alloc all tensors linked to this context
for (struct ggml_tensor* t = ggml_get_first_tensor(ctx); t != NULL; t = ggml_get_next_tensor(ctx, t)) {
if (t->data == NULL) {
ggml_allocr_alloc(alloc, t);
}
}
ggml_allocr_free(alloc);
}
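// Note: the tensors above are created in a no_alloc context, so they carry metadata
// only; the final loop walks every tensor in the context and lets the ggml_allocr
// place any still-unallocated tensor inside params_buffer. The temporary allocator can
// then be freed while the data remains in the backend buffer.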
void map_by_name(std::map<std::string, struct ggml_tensor*>& tensors, const std::string prefix) {
tensors[prefix + "time_embed.0.weight"] = time_embed_0_w;
tensors[prefix + "time_embed.0.bias"] = time_embed_0_b;
tensors[prefix + "time_embed.2.weight"] = time_embed_2_w;
tensors[prefix + "time_embed.2.bias"] = time_embed_2_b;
if (version == VERSION_XL) {
tensors[prefix + "label_emb.0.0.weight"] = label_embed_0_w;
tensors[prefix + "label_emb.0.0.bias"] = label_embed_0_b;
tensors[prefix + "label_emb.0.2.weight"] = label_embed_2_w;
tensors[prefix + "label_emb.0.2.bias"] = label_embed_2_b;
}
// input_blocks
tensors[prefix + "input_blocks.0.0.weight"] = input_block_0_w;
tensors[prefix + "input_blocks.0.0.bias"] = input_block_0_b;
int len_mults = channel_mult.size();
int input_block_idx = 0;
int ds = 1;
for (int i = 0; i < len_mults; i++) {
for (int j = 0; j < num_res_blocks; j++) {
input_block_idx += 1;
input_res_blocks[i][j].map_by_name(tensors, prefix + "input_blocks." + std::to_string(input_block_idx) + ".0.");
if (std::find(attention_resolutions.begin(), attention_resolutions.end(), ds) != attention_resolutions.end()) {
input_transformers[i][j].map_by_name(tensors, prefix + "input_blocks." + std::to_string(input_block_idx) + ".1.");
}
}
if (i != len_mults - 1) {
input_block_idx += 1;
input_down_samples[i].map_by_name(tensors, prefix + "input_blocks." + std::to_string(input_block_idx) + ".0.");
ds *= 2;
}
}
// middle_blocks
middle_block_0.map_by_name(tensors, prefix + "middle_block.0.");
middle_block_1.map_by_name(tensors, prefix + "middle_block.1.");
middle_block_2.map_by_name(tensors, prefix + "middle_block.2.");
// output_blocks
int output_block_idx = 0;
for (int i = len_mults - 1; i >= 0; i--) {
for (int j = 0; j < num_res_blocks + 1; j++) {
output_res_blocks[i][j].map_by_name(tensors, prefix + "output_blocks." + std::to_string(output_block_idx) + ".0.");
int up_sample_idx = 1;
if (std::find(attention_resolutions.begin(), attention_resolutions.end(), ds) != attention_resolutions.end()) {
output_transformers[i][j].map_by_name(tensors, prefix + "output_blocks." + std::to_string(output_block_idx) + ".1.");
up_sample_idx++;
}
if (i > 0 && j == num_res_blocks) {
output_up_samples[i - 1].map_by_name(tensors, prefix + "output_blocks." + std::to_string(output_block_idx) + "." + std::to_string(up_sample_idx) + ".");
ds /= 2;
}
output_block_idx += 1;
}
}
// out
tensors[prefix + "out.0.weight"] = out_0_w;
tensors[prefix + "out.0.bias"] = out_0_b;
tensors[prefix + "out.2.weight"] = out_2_w;
tensors[prefix + "out.2.bias"] = out_2_b;
}
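// Note: the keys built here follow the ldm state_dict layout. With the prefix the
// caller typically uses for the diffusion model ("model.diffusion_model."), the first
// ResBlock maps to e.g. "model.diffusion_model.input_blocks.1.0.in_layers.0.weight"
// and its attention block to "...input_blocks.1.1.transformer_blocks.0.attn1.to_q.weight";
// the prefix itself is supplied by the caller.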
struct ggml_tensor* forward(struct ggml_context* ctx0,
struct ggml_tensor* x,
struct ggml_tensor* timesteps,
struct ggml_tensor* context,
struct ggml_tensor* t_emb = NULL,
struct ggml_tensor* y = NULL) {
// x: [N, in_channels, h, w]
// timesteps: [N, ]
// t_emb: [N, model_channels]
// context: [N, max_position, hidden_size]([N, 77, 768])
// y: [adm_in_channels]
if (t_emb == NULL && timesteps != NULL) {
t_emb = new_timestep_embedding(ctx0, compute_alloc, timesteps, model_channels); // [N, model_channels]
}
// time_embed = nn.Sequential
auto emb = ggml_nn_linear(ctx0, t_emb, time_embed_0_w, time_embed_0_b);
emb = ggml_silu_inplace(ctx0, emb);
emb = ggml_nn_linear(ctx0, emb, time_embed_2_w, time_embed_2_b); // [N, time_embed_dim]
// SDXL
if (y != NULL) {
auto label_emb = ggml_nn_linear(ctx0, y, label_embed_0_w, label_embed_0_b);
label_emb = ggml_silu_inplace(ctx0, label_emb);
label_emb = ggml_nn_linear(ctx0, label_emb, label_embed_2_w, label_embed_2_b);
emb = ggml_add(ctx0, emb, label_emb); // [N, time_embed_dim]
}
// input_blocks
std::vector<struct ggml_tensor*> hs;
// input block 0
struct ggml_tensor* h = ggml_nn_conv_2d(ctx0, x, input_block_0_w, input_block_0_b, 1, 1, 1, 1); // [N, model_channels, h, w]
ggml_set_name(h, "bench-start");
hs.push_back(h);
// input block 1-11
int len_mults = channel_mult.size();
int ds = 1;
for (int i = 0; i < len_mults; i++) {
int mult = channel_mult[i];
for (int j = 0; j < num_res_blocks; j++) {
h = input_res_blocks[i][j].forward(ctx0, h, emb); // [N, mult*model_channels, h, w]
if (std::find(attention_resolutions.begin(), attention_resolutions.end(), ds) != attention_resolutions.end()) {
h = input_transformers[i][j].forward(ctx0, h, context); // [N, mult*model_channels, h, w]
}
hs.push_back(h);
}
if (i != len_mults - 1) {
ds *= 2;
h = input_down_samples[i].forward(ctx0, h); // [N, mult*model_channels, h/(2^(i+1)), w/(2^(i+1))]
hs.push_back(h);
}
}
// [N, 4*model_channels, h/8, w/8]
// middle_block
h = middle_block_0.forward(ctx0, h, emb); // [N, 4*model_channels, h/8, w/8]
h = middle_block_1.forward(ctx0, h, context); // [N, 4*model_channels, h/8, w/8]
h = middle_block_2.forward(ctx0, h, emb); // [N, 4*model_channels, h/8, w/8]
// output_blocks
for (int i = len_mults - 1; i >= 0; i--) {
for (int j = 0; j < num_res_blocks + 1; j++) {
auto h_skip = hs.back();
hs.pop_back();
h = ggml_concat(ctx0, h, h_skip);
h = output_res_blocks[i][j].forward(ctx0, h, emb);
if (std::find(attention_resolutions.begin(), attention_resolutions.end(), ds) != attention_resolutions.end()) {
h = output_transformers[i][j].forward(ctx0, h, context);
}
if (i > 0 && j == num_res_blocks) {
h = output_up_samples[i - 1].forward(ctx0, h);
ds /= 2;
}
}
}
// out
h = ggml_nn_group_norm(ctx0, h, out_0_w, out_0_b);
h = ggml_silu_inplace(ctx0, h);
// conv2d
h = ggml_nn_conv_2d(ctx0, h, out_2_w, out_2_b, 1, 1, 1, 1); // [N, out_channels, h, w]
ggml_set_name(h, "bench-end");
return h;
}
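// Note: the skip connections use `hs` as a LIFO stack: every input block pushes its
// activation and every output block pops one and concatenates it with the current
// feature map along the channel dimension (ggml_concat) before its ResBlock, which is
// why output_res_blocks[i][j].channels was set to ch + ich in the constructor.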
struct ggml_cgraph* build_graph(struct ggml_tensor* x,
struct ggml_tensor* timesteps,
struct ggml_tensor* context,
struct ggml_tensor* t_emb = NULL,
struct ggml_tensor* y = NULL) {
// since we are using ggml-alloc, this buffer only needs enough space to hold the ggml_tensor and ggml_cgraph structs, but not the tensor data
static size_t buf_size = ggml_tensor_overhead() * UNET_GRAPH_SIZE + ggml_graph_overhead();
static std::vector<uint8_t> buf(buf_size);
struct ggml_init_params params = {
/*.mem_size =*/buf_size,
/*.mem_buffer =*/buf.data(),
/*.no_alloc =*/true, // the tensors will be allocated later by ggml_allocr_alloc_graph()
};
// LOG_DEBUG("mem_size %u ", params.mem_size);
struct ggml_context* ctx0 = ggml_init(params);
struct ggml_cgraph* gf = ggml_new_graph_custom(ctx0, UNET_GRAPH_SIZE, false);
// temporary tensors used to copy the inputs from cpu to gpu memory when needed
struct ggml_tensor* x_t = NULL;
struct ggml_tensor* timesteps_t = NULL;
struct ggml_tensor* context_t = NULL;
struct ggml_tensor* t_emb_t = NULL;
struct ggml_tensor* y_t = NULL;
// when the backend is not the cpu, the inputs must be copied into device memory
if (!ggml_backend_is_cpu(backend)) {
// pass input tensors to gpu memory
x_t = ggml_dup_tensor(ctx0, x);
context_t = ggml_dup_tensor(ctx0, context);
ggml_allocr_alloc(compute_alloc, x_t);
if (timesteps != NULL) {
timesteps_t = ggml_dup_tensor(ctx0, timesteps);
ggml_allocr_alloc(compute_alloc, timesteps_t);
}
ggml_allocr_alloc(compute_alloc, context_t);
if (t_emb != NULL) {
t_emb_t = ggml_dup_tensor(ctx0, t_emb);
ggml_allocr_alloc(compute_alloc, t_emb_t);
}
if (y != NULL) {
y_t = ggml_dup_tensor(ctx0, y);
ggml_allocr_alloc(compute_alloc, y_t);
}
// pass data to device backend
if (!ggml_allocr_is_measure(compute_alloc)) {
ggml_backend_tensor_set(x_t, x->data, 0, ggml_nbytes(x));
ggml_backend_tensor_set(context_t, context->data, 0, ggml_nbytes(context));
if (timesteps_t != NULL) {
ggml_backend_tensor_set(timesteps_t, timesteps->data, 0, ggml_nbytes(timesteps));
}
if (t_emb_t != NULL) {
ggml_backend_tensor_set(t_emb_t, t_emb->data, 0, ggml_nbytes(t_emb));
}
if (y != NULL) {
ggml_backend_tensor_set(y_t, y->data, 0, ggml_nbytes(y));
}
}
} else {
// on the cpu backend the input tensors can be used directly
x_t = x;
timesteps_t = timesteps;
context_t = context;
t_emb_t = t_emb;
y_t = y;
}
struct ggml_tensor* out = forward(ctx0, x_t, timesteps_t, context_t, t_emb_t, y_t);
ggml_build_forward_expand(gf, out);
ggml_free(ctx0);
return gf;
}
void begin(struct ggml_tensor* x,
struct ggml_tensor* context,
struct ggml_tensor* t_emb = NULL,
struct ggml_tensor* y = NULL) {
if (compute_memory_buffer_size == -1) {
// create a measure allocator (with the backend's alignment) to estimate the compute buffer size
compute_alloc = ggml_allocr_new_measure_from_backend(backend);
struct ggml_cgraph* gf = build_graph(x, NULL, context, t_emb, y);
// compute the required memory
compute_memory_buffer_size = ggml_allocr_alloc_graph(compute_alloc, gf);
// recreate the allocator with the required memory
ggml_allocr_free(compute_alloc);
LOG_DEBUG("diffusion compute buffer size: %.2f MB", compute_memory_buffer_size / 1024.0 / 1024.0);
}
compute_buffer = ggml_backend_alloc_buffer(backend, compute_memory_buffer_size);
compute_alloc = ggml_allocr_new_from_buffer(compute_buffer);
}
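// Note: begin() follows ggml-alloc's measure-then-allocate pattern: on the first call
// the graph is built with a measure allocator to record the worst-case compute buffer
// size, then a real backend buffer of that size and a fresh allocator are created for
// compute(). end() frees both and resets the cached size so a later begin() with
// different shapes re-measures.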
void compute(struct ggml_tensor* work_latent,
int n_threads,
struct ggml_tensor* x,
struct ggml_tensor* timesteps,
struct ggml_tensor* context,
struct ggml_tensor* t_emb = NULL,
struct ggml_tensor* y = NULL) {
ggml_allocr_reset(compute_alloc);
// compute
struct ggml_cgraph* gf = build_graph(x, timesteps, context, t_emb, y);
ggml_allocr_alloc_graph(compute_alloc, gf);
if (ggml_backend_is_cpu(backend)) {
ggml_backend_cpu_set_n_threads(backend, n_threads);
}
#ifdef SD_USE_METAL
if (ggml_backend_is_metal(backend)) {
ggml_backend_metal_set_n_cb(backend, n_threads);
}
#endif
ggml_backend_graph_compute(backend, gf);
#ifdef GGML_PERF
ggml_graph_print(gf);
#endif
ggml_backend_tensor_get_and_sync(backend, gf->nodes[gf->n_nodes - 1], work_latent->data, 0, ggml_nbytes(work_latent));
}
void end() {
ggml_allocr_free(compute_alloc);
ggml_backend_buffer_free(compute_buffer);
compute_alloc = NULL;
compute_memory_buffer_size = -1;
}
};
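// Illustrative lifecycle sketch for this struct (assumes a caller-prepared
// ggml_backend_t `backend` and input tensors `x`, `timesteps`, `context`, `t_emb`,
// `out_latent` living in a separate ggml context; not taken verbatim from this file):
//   UNetModel unet(VERSION_1_x);
//   unet.initialize(backend, GGML_TYPE_F16);   // params context + backend buffer
//   unet.alloc_params();                       // create and place all weight tensors
//   // ... fill the tensors registered via map_by_name() with the checkpoint data ...
//   unet.begin(x, context, t_emb);             // measure + allocate the compute buffer
//   unet.compute(out_latent, n_threads, x, timesteps, context, t_emb);
//   unet.end();                                // free compute buffer and allocator
//   unet.destroy();                            // free params context and buffer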
/*================================================== AutoEncoderKL ===================================================*/
struct ResnetBlock {
// network hparams
int in_channels;
int out_channels;
// network params
struct ggml_tensor* norm1_w; // [in_channels, ]
struct ggml_tensor* norm1_b; // [in_channels, ]
struct ggml_tensor* conv1_w; // [out_channels, in_channels, 3, 3]
struct ggml_tensor* conv1_b; // [out_channels, ]
struct ggml_tensor* norm2_w; // [out_channels, ]
struct ggml_tensor* norm2_b; // [out_channels, ]
struct ggml_tensor* conv2_w; // [out_channels, out_channels, 3, 3]
struct ggml_tensor* conv2_b; // [out_channels, ]
// nin_shortcut, only if out_channels != in_channels
struct ggml_tensor* nin_shortcut_w; // [out_channels, in_channels, 1, 1]
struct ggml_tensor* nin_shortcut_b; // [out_channels, ]
size_t calculate_mem_size(ggml_type wtype) {
double mem_size = 0;
mem_size += 2 * in_channels * ggml_type_sizef(GGML_TYPE_F32); // norm1_w/b
mem_size += out_channels * in_channels * 3 * 3 * ggml_type_sizef(GGML_TYPE_F16); // conv1_w
mem_size += 4 * out_channels * ggml_type_sizef(GGML_TYPE_F32); // conv1_b/norm2_w/norm2_b/conv2_b
mem_size += out_channels * out_channels * 3 * 3 * ggml_type_sizef(GGML_TYPE_F16); // conv2_w
if (out_channels != in_channels) {
mem_size += out_channels * in_channels * 1 * 1 * ggml_type_sizef(GGML_TYPE_F16); // nin_shortcut_w
mem_size += out_channels * ggml_type_sizef(GGML_TYPE_F32); // nin_shortcut_b
}
return static_cast<size_t>(mem_size);
}
void init_params(struct ggml_context* ctx, ggml_type wtype) {
norm1_w = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, in_channels);
norm1_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, in_channels);
conv1_w = ggml_new_tensor_4d(ctx, GGML_TYPE_F16, 3, 3, in_channels, out_channels);
conv1_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, out_channels);
norm2_w = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, out_channels);
norm2_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, out_channels);
conv2_w = ggml_new_tensor_4d(ctx, GGML_TYPE_F16, 3, 3, out_channels, out_channels);
conv2_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, out_channels);
if (out_channels != in_channels) {
nin_shortcut_w = ggml_new_tensor_4d(ctx, GGML_TYPE_F16, 1, 1, in_channels, out_channels);
nin_shortcut_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, out_channels);
}
}
void map_by_name(std::map<std::string, struct ggml_tensor*>& tensors, const std::string prefix) {
tensors[prefix + "norm1.weight"] = norm1_w;
tensors[prefix + "norm1.bias"] = norm1_b;
tensors[prefix + "conv1.weight"] = conv1_w;
tensors[prefix + "conv1.bias"] = conv1_b;
tensors[prefix + "norm2.weight"] = norm2_w;
tensors[prefix + "norm2.bias"] = norm2_b;
tensors[prefix + "conv2.weight"] = conv2_w;
tensors[prefix + "conv2.bias"] = conv2_b;
if (out_channels != in_channels) {
tensors[prefix + "nin_shortcut.weight"] = nin_shortcut_w;
tensors[prefix + "nin_shortcut.bias"] = nin_shortcut_b;
}
}
struct ggml_tensor* forward(struct ggml_context* ctx, struct ggml_tensor* z) {
// z: [N, in_channels, h, w]
auto h = ggml_nn_group_norm(ctx, z, norm1_w, norm1_b);
h = ggml_silu_inplace(ctx, h);
h = ggml_nn_conv_2d(ctx, h, conv1_w, conv1_b, 1, 1, 1, 1); // [N, out_channels, h, w]
h = ggml_nn_group_norm(ctx, h, norm2_w, norm2_b);
h = ggml_silu_inplace(ctx, h);
// dropout, skip for inference
h = ggml_nn_conv_2d(ctx, h, conv2_w, conv2_b, 1, 1, 1, 1); // [N, out_channels, h, w]
// skip connection
if (out_channels != in_channels) {
z = ggml_nn_conv_2d(ctx, z, nin_shortcut_w, nin_shortcut_b); // [N, out_channels, h, w]
}
h = ggml_add(ctx, h, z);
return h; // [N, out_channels, h, w]
}
};
struct AttnBlock {
int in_channels; // mult * model_channels
// group norm
struct ggml_tensor* norm_w; // [in_channels,]
struct ggml_tensor* norm_b; // [in_channels,]
// q/k/v
struct ggml_tensor* q_w; // [in_channels, in_channels, 1, 1]
struct ggml_tensor* q_b; // [in_channels,]
struct ggml_tensor* k_w; // [in_channels, in_channels, 1, 1]
struct ggml_tensor* k_b; // [in_channels,]
struct ggml_tensor* v_w; // [in_channels, in_channels, 1, 1]
struct ggml_tensor* v_b; // [in_channels,]
// proj_out
struct ggml_tensor* proj_out_w; // [in_channels, in_channels, 1, 1]
struct ggml_tensor* proj_out_b; // [in_channels,]
struct ggml_tensor* attn_scale;
size_t calculate_mem_size(ggml_type wtype) {
double mem_size = 0;
mem_size += 6 * in_channels * ggml_type_sizef(GGML_TYPE_F32); // norm_w/norm_b/q_b/k_b/v_b/proj_out_b
mem_size += 4 * in_channels * in_channels * 1 * 1 * ggml_type_sizef(GGML_TYPE_F16); // q_w/k_w/v_w/proj_out_w
return static_cast<size_t>(mem_size);
}
void init_params(struct ggml_context* ctx, ggml_allocr* alloc, ggml_type wtype) {
norm_w = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, in_channels);
norm_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, in_channels);
q_w = ggml_new_tensor_4d(ctx, GGML_TYPE_F16, 1, 1, in_channels, in_channels);
q_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, in_channels);
k_w = ggml_new_tensor_4d(ctx, GGML_TYPE_F16, 1, 1, in_channels, in_channels);
k_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, in_channels);
v_w = ggml_new_tensor_4d(ctx, GGML_TYPE_F16, 1, 1, in_channels, in_channels);
v_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, in_channels);
proj_out_w = ggml_new_tensor_4d(ctx, GGML_TYPE_F16, 1, 1, in_channels, in_channels);
proj_out_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, in_channels);
attn_scale = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 1);
ggml_allocr_alloc(alloc, attn_scale);
float scale = 1.0f / sqrt((float)in_channels);
ggml_backend_tensor_set(attn_scale, &scale, 0, sizeof(scale));
}
void map_by_name(std::map<std::string, struct ggml_tensor*>& tensors, const std::string prefix) {
tensors[prefix + "norm.weight"] = norm_w;
tensors[prefix + "norm.bias"] = norm_b;
tensors[prefix + "q.weight"] = q_w;
tensors[prefix + "q.bias"] = q_b;
tensors[prefix + "k.weight"] = k_w;
tensors[prefix + "k.bias"] = k_b;
tensors[prefix + "v.weight"] = v_w;
tensors[prefix + "v.bias"] = v_b;
tensors[prefix + "proj_out.weight"] = proj_out_w;
tensors[prefix + "proj_out.bias"] = proj_out_b;
}
struct ggml_tensor* forward(struct ggml_context* ctx, struct ggml_tensor* x) {
// x: [N, in_channels, h, w]
auto h_ = ggml_nn_group_norm(ctx, x, norm_w, norm_b);
const int64_t n = h_->ne[3];
const int64_t c = h_->ne[2];
const int64_t h = h_->ne[1];
const int64_t w = h_->ne[0];
auto q = ggml_nn_conv_2d(ctx, h_, q_w, q_b); // [N, in_channels, h, w]
auto k = ggml_nn_conv_2d(ctx, h_, k_w, k_b); // [N, in_channels, h, w]
auto v = ggml_nn_conv_2d(ctx, h_, v_w, v_b); // [N, in_channels, h, w]
q = ggml_cont(ctx, ggml_permute(ctx, q, 1, 2, 0, 3)); // [N, h, w, in_channels]
q = ggml_reshape_3d(ctx, q, c, h * w, n); // [N, h * w, in_channels]
k = ggml_cont(ctx, ggml_permute(ctx, k, 1, 2, 0, 3)); // [N, h, w, in_channels]
k = ggml_reshape_3d(ctx, k, c, h * w, n); // [N, h * w, in_channels]
auto w_ = ggml_mul_mat(ctx, k, q); // [N, h * w, h * w]
w_ = ggml_scale_inplace(ctx, w_, attn_scale);
w_ = ggml_soft_max_inplace(ctx, w_);
v = ggml_reshape_3d(ctx, v, h * w, c, n); // [N, in_channels, h * w]
h_ = ggml_mul_mat(ctx, v, w_); // [N, h * w, in_channels]
h_ = ggml_cont(ctx, ggml_permute(ctx, h_, 1, 0, 2, 3)); // [N, in_channels, h * w]
h_ = ggml_reshape_4d(ctx, h_, w, h, c, n); // [N, in_channels, h, w]
// proj_out
h_ = ggml_nn_conv_2d(ctx, h_, proj_out_w, proj_out_b); // [N, in_channels, h, w]
h_ = ggml_add(ctx, h_, x);
return h_;
}
};
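// Note: this is single-head self-attention over the h*w spatial positions, with the
// full channel count as the head size: softmax(q k^T / sqrt(in_channels)) v, where
// q/k/v and proj_out are 1x1 convs and the result is added back to the input.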
// ldm.modules.diffusionmodules.model.Encoder
struct Encoder {
int embed_dim = 4;
int ch = 128;
int z_channels = 4;
int in_channels = 3;
int num_res_blocks = 2;
int ch_mult[4] = {1, 2, 4, 4};
struct ggml_tensor* conv_in_w; // [ch, in_channels, 3, 3]
struct ggml_tensor* conv_in_b; // [ch, ]
ResnetBlock down_blocks[4][2];
DownSample down_samples[3];
struct
{
ResnetBlock block_1;
AttnBlock attn_1;
ResnetBlock block_2;
} mid;
// block_in = ch * ch_mult[len_mults - 1]
struct ggml_tensor* norm_out_w; // [block_in, ]
struct ggml_tensor* norm_out_b; // [block_in, ]
struct ggml_tensor* conv_out_w; // [embed_dim*2, block_in, 3, 3]
struct ggml_tensor* conv_out_b; // [embed_dim*2, ]
Encoder() {
int len_mults = sizeof(ch_mult) / sizeof(int);
int block_in = 1;
for (int i = 0; i < len_mults; i++) {
if (i == 0) {
block_in = ch;
} else {
block_in = ch * ch_mult[i - 1];
}
int block_out = ch * ch_mult[i];
for (int j = 0; j < num_res_blocks; j++) {
down_blocks[i][j].in_channels = block_in;
down_blocks[i][j].out_channels = block_out;
block_in = block_out;
}
if (i != len_mults - 1) {
down_samples[i].channels = block_in;
down_samples[i].out_channels = block_in;
down_samples[i].vae_downsample = true;
}
}
mid.block_1.in_channels = block_in;
mid.block_1.out_channels = block_in;
mid.attn_1.in_channels = block_in;
mid.block_2.in_channels = block_in;
mid.block_2.out_channels = block_in;
}
size_t get_num_tensors() {
int num_tensors = 6;
// mid
num_tensors += 10 * 3;
int len_mults = sizeof(ch_mult) / sizeof(int);
for (int i = len_mults - 1; i >= 0; i--) {
for (int j = 0; j < num_res_blocks + 1; j++) {
num_tensors += 10;
}
if (i != 0) {
num_tensors += 2;
}
}
return num_tensors;
}
size_t calculate_mem_size(ggml_type wtype) {
double mem_size = 0;
int len_mults = sizeof(ch_mult) / sizeof(int);
int block_in = ch * ch_mult[len_mults - 1];
mem_size += ch * in_channels * 3 * 3 * ggml_type_sizef(GGML_TYPE_F16); // conv_in_w
mem_size += ch * ggml_type_sizef(GGML_TYPE_F32); // conv_in_b
mem_size += 2 * block_in * ggml_type_sizef(GGML_TYPE_F32); // norm_out_w/b
mem_size += z_channels * 2 * block_in * 3 * 3 * ggml_type_sizef(GGML_TYPE_F16); // conv_out_w
mem_size += z_channels * 2 * ggml_type_sizef(GGML_TYPE_F32); // conv_out_b
mem_size += mid.block_1.calculate_mem_size(wtype);
mem_size += mid.attn_1.calculate_mem_size(wtype);
mem_size += mid.block_2.calculate_mem_size(wtype);
for (int i = len_mults - 1; i >= 0; i--) {
for (int j = 0; j < num_res_blocks + 1; j++) {
mem_size += down_blocks[i][j].calculate_mem_size(wtype);
}
if (i != 0) {
mem_size += down_samples[i - 1].calculate_mem_size(wtype);
}
}
return static_cast<size_t>(mem_size);
}
void init_params(struct ggml_context* ctx, ggml_allocr* alloc, ggml_type wtype) {
int len_mults = sizeof(ch_mult) / sizeof(int);
int block_in = ch * ch_mult[len_mults - 1];
conv_in_w = ggml_new_tensor_4d(ctx, GGML_TYPE_F16, 3, 3, in_channels, ch);
conv_in_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, ch);
norm_out_w = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, block_in);
norm_out_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, block_in);
conv_out_w = ggml_new_tensor_4d(ctx, GGML_TYPE_F16, 3, 3, block_in, z_channels * 2);
conv_out_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, z_channels * 2);
mid.block_1.init_params(ctx, wtype);
mid.attn_1.init_params(ctx, alloc, wtype);
mid.block_2.init_params(ctx, wtype);
for (int i = 0; i < len_mults; i++) {
for (int j = 0; j < num_res_blocks; j++) {
down_blocks[i][j].init_params(ctx, wtype);
}
if (i != len_mults - 1) {
down_samples[i].init_params(ctx, wtype);
}
}
}
void map_by_name(std::map<std::string, struct ggml_tensor*>& tensors, const std::string prefix) {
tensors[prefix + "norm_out.weight"] = norm_out_w;
tensors[prefix + "norm_out.bias"] = norm_out_b;
tensors[prefix + "conv_in.weight"] = conv_in_w;
tensors[prefix + "conv_in.bias"] = conv_in_b;
tensors[prefix + "conv_out.weight"] = conv_out_w;
tensors[prefix + "conv_out.bias"] = conv_out_b;
mid.block_1.map_by_name(tensors, prefix + "mid.block_1.");
mid.attn_1.map_by_name(tensors, prefix + "mid.attn_1.");
mid.block_2.map_by_name(tensors, prefix + "mid.block_2.");
int len_mults = sizeof(ch_mult) / sizeof(int);
for (int i = 0; i < len_mults; i++) {
for (int j = 0; j < num_res_blocks; j++) {
down_blocks[i][j].map_by_name(tensors, prefix + "down." + std::to_string(i) + ".block." + std::to_string(j) + ".");
}
if (i != len_mults - 1) {
down_samples[i].map_by_name(tensors, prefix + "down." + std::to_string(i) + ".downsample.");
}
}
}
struct ggml_tensor* forward(struct ggml_context* ctx, struct ggml_tensor* x) {
// x: [N, in_channels, h, w]
// conv_in
auto h = ggml_nn_conv_2d(ctx, x, conv_in_w, conv_in_b, 1, 1, 1, 1); // [N, ch, h, w]
ggml_set_name(h, "b-start");
int len_mults = sizeof(ch_mult) / sizeof(int);
for (int i = 0; i < len_mults; i++) {
for (int j = 0; j < num_res_blocks; j++) {
h = down_blocks[i][j].forward(ctx, h);
}
if (i != len_mults - 1) {
h = down_samples[i].forward(ctx, h);
}
}
h = mid.block_1.forward(ctx, h);
h = mid.attn_1.forward(ctx, h);
h = mid.block_2.forward(ctx, h); // [N, block_in, h, w]
h = ggml_nn_group_norm(ctx, h, norm_out_w, norm_out_b);
h = ggml_silu_inplace(ctx, h);
// conv_out
h = ggml_nn_conv_2d(ctx, h, conv_out_w, conv_out_b, 1, 1, 1, 1); // [N, z_channels*2, h, w]
return h;
}
};
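// Note: with ch_mult = {1, 2, 4, 4} the encoder downsamples three times, so a
// [N, 3, H, W] image becomes [N, 2*z_channels, H/8, W/8]; the doubled channel count
// holds the mean and log-variance of the diagonal Gaussian posterior that the caller
// samples from (or takes the mean of) after quant_conv.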
// ldm.modules.diffusionmodules.model.Decoder
struct Decoder {
int embed_dim = 4;
int ch = 128;
int z_channels = 4;
int out_ch = 3;
int num_res_blocks = 2;
int ch_mult[4] = {1, 2, 4, 4};
// block_in = ch * ch_mult[-1], 512
struct ggml_tensor* conv_in_w; // [block_in, z_channels, 3, 3]
struct ggml_tensor* conv_in_b; // [block_in, ]
struct
{
ResnetBlock block_1;
AttnBlock attn_1;
ResnetBlock block_2;
} mid;
ResnetBlock up_blocks[4][3];
UpSample up_samples[3];
struct ggml_tensor* norm_out_w; // [ch * ch_mult[0], ]
struct ggml_tensor* norm_out_b; // [ch * ch_mult[0], ]
struct ggml_tensor* conv_out_w; // [out_ch, ch * ch_mult[0], 3, 3]
struct ggml_tensor* conv_out_b; // [out_ch, ]
Decoder() {
int len_mults = sizeof(ch_mult) / sizeof(int);
int block_in = ch * ch_mult[len_mults - 1];
mid.block_1.in_channels = block_in;
mid.block_1.out_channels = block_in;
mid.attn_1.in_channels = block_in;
mid.block_2.in_channels = block_in;
mid.block_2.out_channels = block_in;
for (int i = len_mults - 1; i >= 0; i--) {
int mult = ch_mult[i];
int block_out = ch * mult;
for (int j = 0; j < num_res_blocks + 1; j++) {
up_blocks[i][j].in_channels = block_in;
up_blocks[i][j].out_channels = block_out;
block_in = block_out;
}
if (i != 0) {
up_samples[i - 1].channels = block_in;
up_samples[i - 1].out_channels = block_in;
}
}
}
size_t calculate_mem_size(ggml_type wtype) {
double mem_size = 0;
int len_mults = sizeof(ch_mult) / sizeof(int);
int block_in = ch * ch_mult[len_mults - 1];
mem_size += block_in * z_channels * 3 * 3 * ggml_type_sizef(GGML_TYPE_F16); // conv_in_w
mem_size += block_in * ggml_type_sizef(GGML_TYPE_F32); // conv_in_b
mem_size += 2 * (ch * ch_mult[0]) * ggml_type_sizef(GGML_TYPE_F32); // norm_out_w/b
mem_size += (ch * ch_mult[0]) * out_ch * 3 * 3 * ggml_type_sizef(GGML_TYPE_F16); // conv_out_w
mem_size += out_ch * ggml_type_sizef(GGML_TYPE_F32); // conv_out_b
mem_size += mid.block_1.calculate_mem_size(wtype);
mem_size += mid.attn_1.calculate_mem_size(wtype);
mem_size += mid.block_2.calculate_mem_size(wtype);
for (int i = len_mults - 1; i >= 0; i--) {
for (int j = 0; j < num_res_blocks + 1; j++) {
mem_size += up_blocks[i][j].calculate_mem_size(wtype);
}
if (i != 0) {
mem_size += up_samples[i - 1].calculate_mem_size(wtype);
}
}
return static_cast<size_t>(mem_size);
}
size_t get_num_tensors() {
int num_tensors = 8;
// mid
num_tensors += 10 * 3;
int len_mults = sizeof(ch_mult) / sizeof(int);
for (int i = len_mults - 1; i >= 0; i--) {
for (int j = 0; j < num_res_blocks + 1; j++) {
num_tensors += 10;
}
if (i != 0) {
num_tensors += 2;
}
}
return num_tensors;
}
void init_params(struct ggml_context* ctx, ggml_allocr* alloc, ggml_type wtype) {
int len_mults = sizeof(ch_mult) / sizeof(int);
int block_in = ch * ch_mult[len_mults - 1];
norm_out_w = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, ch * ch_mult[0]);
norm_out_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, ch * ch_mult[0]);
conv_in_w = ggml_new_tensor_4d(ctx, GGML_TYPE_F16, 3, 3, z_channels, block_in);
conv_in_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, block_in);
conv_out_w = ggml_new_tensor_4d(ctx, GGML_TYPE_F16, 3, 3, ch * ch_mult[0], out_ch);
conv_out_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, out_ch);
mid.block_1.init_params(ctx, wtype);
mid.attn_1.init_params(ctx, alloc, wtype);
mid.block_2.init_params(ctx, wtype);
for (int i = len_mults - 1; i >= 0; i--) {
for (int j = 0; j < num_res_blocks + 1; j++) {
up_blocks[i][j].init_params(ctx, wtype);
}
if (i != 0) {
up_samples[i - 1].init_params(ctx, wtype);
}
}
}
void map_by_name(std::map<std::string, struct ggml_tensor*>& tensors, const std::string prefix) {
tensors[prefix + "norm_out.weight"] = norm_out_w;
tensors[prefix + "norm_out.bias"] = norm_out_b;
tensors[prefix + "conv_in.weight"] = conv_in_w;
tensors[prefix + "conv_in.bias"] = conv_in_b;
tensors[prefix + "conv_out.weight"] = conv_out_w;
tensors[prefix + "conv_out.bias"] = conv_out_b;
mid.block_1.map_by_name(tensors, prefix + "mid.block_1.");
mid.attn_1.map_by_name(tensors, prefix + "mid.attn_1.");
mid.block_2.map_by_name(tensors, prefix + "mid.block_2.");
int len_mults = sizeof(ch_mult) / sizeof(int);
for (int i = len_mults - 1; i >= 0; i--) {
for (int j = 0; j < num_res_blocks + 1; j++) {
up_blocks[i][j].map_by_name(tensors, prefix + "up." + std::to_string(i) + ".block." + std::to_string(j) + ".");
}
if (i != 0) {
up_samples[i - 1].map_by_name(tensors, prefix + "up." + std::to_string(i) + ".upsample.");
}
}
}
struct ggml_tensor* forward(struct ggml_context* ctx, struct ggml_tensor* z) {
// z: [N, z_channels, h, w]
// conv_in
auto h = ggml_nn_conv_2d(ctx, z, conv_in_w, conv_in_b, 1, 1, 1, 1); // [N, block_in, h, w]
h = mid.block_1.forward(ctx, h);
h = mid.attn_1.forward(ctx, h);
h = mid.block_2.forward(ctx, h); // [N, block_in, h, w]
int len_mults = sizeof(ch_mult) / sizeof(int);
for (int i = len_mults - 1; i >= 0; i--) {
for (int j = 0; j < num_res_blocks + 1; j++) {
h = up_blocks[i][j].forward(ctx, h);
}
if (i != 0) {
h = up_samples[i - 1].forward(ctx, h);
}
}
// group norm 32
h = ggml_nn_group_norm(ctx, h, norm_out_w, norm_out_b);
h = ggml_silu_inplace(ctx, h);
// conv_out
h = ggml_nn_conv_2d(ctx, h, conv_out_w, conv_out_b, 1, 1, 1, 1); // [N, out_ch, h, w]
return h;
}
};
// ldm.models.autoencoder.AutoencoderKL
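// KL autoencoder used as the first-stage model. With ch_mult = {1, 2, 4, 4} the encoder
// downsamples by a factor of 8, so decode() maps latents [N, z_channels=4, h, w] to images
// [N, 3, 8*h, 8*w] and encode() produces 2*z_channels channels (mean/logvar) at h/8 x w/8.
// The latents are additionally scaled by scale_factor further down in this file (the usual
// ldm convention).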
struct AutoEncoderKL {
bool decode_only = true;
int embed_dim = 4;
struct
{
int z_channels = 4;
int resolution = 256;
int in_channels = 3;
int out_ch = 3;
int ch = 128;
int ch_mult[4] = {1, 2, 4, 4};
int num_res_blocks = 2;
} dd_config;
struct ggml_tensor* quant_conv_w; // [2*embed_dim, 2*z_channels, 1, 1]
struct ggml_tensor* quant_conv_b; // [2*embed_dim, ]
struct ggml_tensor* post_quant_conv_w; // [z_channels, embed_dim, 1, 1]
struct ggml_tensor* post_quant_conv_b; // [z_channels, ]
Encoder encoder;
Decoder decoder;
struct ggml_context* ctx = NULL;
ggml_backend_buffer_t params_buffer = NULL;
ggml_backend_buffer_t compute_buffer = NULL; // for compute
struct ggml_allocr* compute_alloc = NULL;
int memory_buffer_size = 0;
ggml_type wtype;
ggml_backend_t backend = NULL;
AutoEncoderKL(bool decode_only = false)
: decode_only(decode_only) {
assert(sizeof(dd_config.ch_mult) == sizeof(encoder.ch_mult));
assert(sizeof(dd_config.ch_mult) == sizeof(decoder.ch_mult));
encoder.embed_dim = embed_dim;
decoder.embed_dim = embed_dim;
encoder.ch = dd_config.ch;
decoder.ch = dd_config.ch;
encoder.z_channels = dd_config.z_channels;
decoder.z_channels = dd_config.z_channels;
encoder.in_channels = dd_config.in_channels;
decoder.out_ch = dd_config.out_ch;
encoder.num_res_blocks = dd_config.num_res_blocks;
int len_mults = sizeof(dd_config.ch_mult) / sizeof(int);
for (int i = 0; i < len_mults; i++) {
encoder.ch_mult[i] = dd_config.ch_mult[i];
decoder.ch_mult[i] = dd_config.ch_mult[i];
}
}
size_t calculate_mem_size() {
double mem_size = 0;
if (!decode_only) {
mem_size += 2 * embed_dim * 2 * dd_config.z_channels * 1 * 1 * ggml_type_sizef(GGML_TYPE_F16); // quant_conv_w
mem_size += 2 * embed_dim * ggml_type_sizef(GGML_TYPE_F32); // quant_conv_b
mem_size += encoder.calculate_mem_size(wtype);
}
mem_size += dd_config.z_channels * embed_dim * 1 * 1 * ggml_type_sizef(GGML_TYPE_F16); // post_quant_conv_w
mem_size += dd_config.z_channels * ggml_type_sizef(GGML_TYPE_F32); // post_quant_conv_b
mem_size += decoder.calculate_mem_size(wtype);
return static_cast<size_t>(mem_size);
}
bool initialize(ggml_backend_t backend_, ggml_type wtype_) {
backend = backend_;
wtype = wtype_;
memory_buffer_size = 1 * 1024 * 1024; // 1 MB, for padding
memory_buffer_size += (int)calculate_mem_size();
int num_tensors = 0;
if (!decode_only) {
num_tensors += 2;
num_tensors += (int)encoder.get_num_tensors();
}
num_tensors += (int)decoder.get_num_tensors();
LOG_DEBUG("vae params backend buffer size = % 6.2f MB (%i tensors)", memory_buffer_size / (1024.0 * 1024.0), num_tensors);
struct ggml_init_params params;
params.mem_size = static_cast<size_t>(num_tensors * ggml_tensor_overhead());
params.mem_buffer = NULL;
params.no_alloc = true;
// LOG_DEBUG("mem_size %u ", params.mem_size);
params_buffer = ggml_backend_alloc_buffer(backend, memory_buffer_size);
ctx = ggml_init(params);
if (!ctx) {
LOG_ERROR("ggml_init() failed");
return false;
}
return true;
}
void destroy() {
if (ctx != NULL) {
ggml_free(ctx);
ctx = NULL;
}
}
void alloc_params() {
ggml_allocr* alloc = ggml_allocr_new_from_buffer(params_buffer);
if (!decode_only) {
quant_conv_w = ggml_new_tensor_4d(ctx, GGML_TYPE_F16, 1, 1, 2 * dd_config.z_channels, 2 * embed_dim);
quant_conv_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 2 * embed_dim);
encoder.init_params(ctx, alloc, wtype);
}
post_quant_conv_w = ggml_new_tensor_4d(ctx, GGML_TYPE_F16, 1, 1, embed_dim, dd_config.z_channels);
post_quant_conv_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, dd_config.z_channels);
decoder.init_params(ctx, alloc, wtype);
// alloc all tensors linked to this context
for (struct ggml_tensor* t = ggml_get_first_tensor(ctx); t != NULL; t = ggml_get_next_tensor(ctx, t)) {
if (t->data == NULL) {
ggml_allocr_alloc(alloc, t);
}
}
ggml_allocr_free(alloc);
}
void map_by_name(std::map<std::string, struct ggml_tensor*>& tensors, const std::string prefix) {
if (!decode_only) {
tensors[prefix + "quant_conv.weight"] = quant_conv_w;
tensors[prefix + "quant_conv.bias"] = quant_conv_b;
encoder.map_by_name(tensors, prefix + "encoder.");
}
tensors[prefix + "post_quant_conv.weight"] = post_quant_conv_w;
tensors[prefix + "post_quant_conv.bias"] = post_quant_conv_b;
decoder.map_by_name(tensors, prefix + "decoder.");
}
struct ggml_tensor* decode(struct ggml_context* ctx0, struct ggml_tensor* z) {
// z: [N, z_channels, h, w]
// post_quant_conv
auto h = ggml_nn_conv_2d(ctx0, z, post_quant_conv_w, post_quant_conv_b); // [N, z_channels, h, w]
ggml_set_name(h, "bench-start");
h = decoder.forward(ctx0, h);
ggml_set_name(h, "bench-end");
return h;
}
struct ggml_tensor* encode(struct ggml_context* ctx0, struct ggml_tensor* x) {
// x: [N, in_channels, h, w]
auto h = encoder.forward(ctx0, x); // [N, 2*z_channels, h/8, w/8]
// quant_conv
h = ggml_nn_conv_2d(ctx0, h, quant_conv_w, quant_conv_b); // [N, 2*embed_dim, h/8, w/8]
ggml_set_name(h, "b-end");
return h;
}
struct ggml_cgraph* build_graph(struct ggml_tensor* z, bool decode_graph) {
// since we are using ggml-alloc, this buffer only needs enough space to hold the ggml_tensor and ggml_cgraph structs, but not the tensor data
static size_t buf_size = ggml_tensor_overhead() * UNET_GRAPH_SIZE + ggml_graph_overhead();
static std::vector<uint8_t> buf(buf_size);
struct ggml_init_params params = {
/*.mem_size =*/buf_size,
/*.mem_buffer =*/buf.data(),
/*.no_alloc =*/true, // the tensors will be allocated later by ggml_allocr_alloc_graph()
};
// LOG_DEBUG("mem_size %u ", params.mem_size);
struct ggml_context* ctx0 = ggml_init(params);
struct ggml_cgraph* gf = ggml_new_graph(ctx0);
struct ggml_tensor* z_ = NULL;
// if the backend is not the CPU backend, the input has to be copied into device memory
if (!ggml_backend_is_cpu(backend)) {
// pass input tensors to gpu memory
z_ = ggml_dup_tensor(ctx0, z);
ggml_allocr_alloc(compute_alloc, z_);
// pass data to device backend
if (!ggml_allocr_is_measure(compute_alloc)) {
ggml_backend_tensor_set(z_, z->data, 0, ggml_nbytes(z));
}
} else {
z_ = z;
}
struct ggml_tensor* out = decode_graph ? decode(ctx0, z_) : encode(ctx0, z_);
ggml_build_forward_expand(gf, out);
ggml_free(ctx0);
return gf;
}
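// Typical call order (sketch): begin(x, decode) once to measure and allocate the compute
// buffer for this input shape, then compute(...) to run the graph, then end() to release it:
//   first_stage_model.begin(latent, true);
//   first_stage_model.compute(result, n_threads, latent, true);
//   first_stage_model.end();
// ("latent"/"result" are placeholder tensors here.)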
void begin(struct ggml_tensor* x, bool decode) {
// measure the amount of memory required (with the alignment required by the backend)
compute_alloc = ggml_allocr_new_measure_from_backend(backend);
struct ggml_cgraph* gf = build_graph(x, decode);
// compute the required memory
size_t compute_memory_buffer_size = ggml_allocr_alloc_graph(compute_alloc, gf);
// recreate the allocator with the required memory
ggml_allocr_free(compute_alloc);
LOG_DEBUG("vae compute buffer size: %.2f MB", compute_memory_buffer_size / 1024.0 / 1024.0);
compute_buffer = ggml_backend_alloc_buffer(backend, compute_memory_buffer_size);
compute_alloc = ggml_allocr_new_from_buffer(compute_buffer);
}
void compute(struct ggml_tensor* work_result, const int n_threads, struct ggml_tensor* z, bool decode_graph) {
ggml_allocr_reset(compute_alloc);
struct ggml_cgraph* gf = build_graph(z, decode_graph);
ggml_allocr_alloc_graph(compute_alloc, gf);
if (ggml_backend_is_cpu(backend)) {
ggml_backend_cpu_set_n_threads(backend, n_threads);
}
#ifdef SD_USE_METAL
if (ggml_backend_is_metal(backend)) {
ggml_backend_metal_set_n_cb(backend, n_threads);
}
#endif
ggml_backend_graph_compute(backend, gf);
#ifdef GGML_PERF
ggml_graph_print(gf);
#endif
ggml_backend_tensor_get_and_sync(backend, gf->nodes[gf->n_nodes - 1], work_result->data, 0, ggml_nbytes(work_result));
}
void end() {
ggml_allocr_free(compute_alloc);
ggml_backend_buffer_free(compute_buffer);
compute_alloc = NULL;
}
};
/*
=================================== TinyAutoEncoder ===================================
References:
https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/autoencoder_tiny.py
https://github.com/madebyollin/taesd/blob/main/taesd.py
*/
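// TAESD is a small convolutional approximation of the KL-VAE above: it operates on the same
// 4-channel latents but trades some quality for a much smaller and faster encoder/decoder.
// TAEBlock below is its basic residual unit: three 3x3 convs with ReLU activations plus an
// identity (or 1x1 conv) skip connection.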
struct TAEBlock {
int in_channels;
int out_channels;
// conv
ggml_tensor* conv_0_w; // [in_channels, out_channels, 3, 3]
ggml_tensor* conv_0_b; // [in_channels]
ggml_tensor* conv_1_w; // [out_channels, out_channels, 3, 3]
ggml_tensor* conv_1_b; // [out_channels]
ggml_tensor* conv_2_w; // [out_channels, out_channels, 3, 3]
ggml_tensor* conv_2_b; // [out_channels]
// skip
ggml_tensor* conv_skip_w; // [in_channels, out_channels, 1, 1]
size_t calculate_mem_size() {
size_t mem_size = in_channels * out_channels * 3 * 3 * ggml_type_size(GGML_TYPE_F16); // conv_0_w
mem_size += in_channels * ggml_type_size(GGML_TYPE_F32); // conv_0_b
mem_size += out_channels * out_channels * 3 * 3 * ggml_type_size(GGML_TYPE_F16); // conv_1_w
mem_size += out_channels * ggml_type_size(GGML_TYPE_F32); // conv_1_b
mem_size += out_channels * out_channels * 3 * 3 * ggml_type_size(GGML_TYPE_F16); // conv_2_w
mem_size += out_channels * ggml_type_size(GGML_TYPE_F32); // conv_2_b
if (in_channels != out_channels) {
mem_size += in_channels * out_channels * ggml_type_size(GGML_TYPE_F16); // conv_skip_w
}
return mem_size;
}
int get_num_tensors() {
return 6 + (in_channels != out_channels ? 1 : 0);
}
void init_params(ggml_context* ctx) {
conv_0_w = ggml_new_tensor_4d(ctx, GGML_TYPE_F16, 3, 3, out_channels, in_channels);
conv_0_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, in_channels);
conv_1_w = ggml_new_tensor_4d(ctx, GGML_TYPE_F16, 3, 3, out_channels, out_channels);
conv_1_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, out_channels);
conv_2_w = ggml_new_tensor_4d(ctx, GGML_TYPE_F16, 3, 3, out_channels, out_channels);
conv_2_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, out_channels);
if (in_channels != out_channels) {
conv_skip_w = ggml_new_tensor_4d(ctx, GGML_TYPE_F16, 1, 1, out_channels, in_channels);
}
}
void map_by_name(std::map<std::string, ggml_tensor*>& tensors, std::string prefix) {
tensors[prefix + "conv.0.weight"] = conv_0_w;
tensors[prefix + "conv.0.bias"] = conv_0_b;
tensors[prefix + "conv.2.weight"] = conv_1_w;
tensors[prefix + "conv.2.bias"] = conv_1_b;
tensors[prefix + "conv.4.weight"] = conv_2_w;
tensors[prefix + "conv.4.bias"] = conv_2_b;
if (in_channels != out_channels) {
tensors[prefix + "skip.weight"] = conv_skip_w;
}
}
ggml_tensor* forward(ggml_context* ctx, ggml_tensor* x) {
// conv(n_in, n_out)
ggml_tensor* h;
h = ggml_nn_conv_2d(ctx, x, conv_0_w, conv_0_b, 1, 1, 1, 1);
h = ggml_relu_inplace(ctx, h);
h = ggml_nn_conv_2d(ctx, h, conv_1_w, conv_1_b, 1, 1, 1, 1);
h = ggml_relu_inplace(ctx, h);
h = ggml_nn_conv_2d(ctx, h, conv_2_w, conv_2_b, 1, 1, 1, 1);
// skip connection
if (in_channels != out_channels) {
// skip = nn.Conv2d(n_in, n_out, 1, bias=False) if n_in != n_out else nn.Identity()
x = ggml_nn_conv_2d(ctx, x, conv_skip_w, NULL, 1, 1, 1, 1);
}
h = ggml_add(ctx, h, x);
h = ggml_relu_inplace(ctx, h);
return h;
}
};
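// TinyEncoder: a conv stem followed by three groups of residual blocks separated by stride-2
// convs, so the spatial size is reduced by 8x overall: [N, 3, H, W] -> [N, z_channels=4, H/8, W/8].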
struct TinyEncoder {
int in_channels = 3;
int z_channels = 4;
int channels = 64;
int num_blocks = 3;
// input
ggml_tensor* conv_input_w; // [channels, in_channels, 3, 3]
ggml_tensor* conv_input_b; // [channels]
TAEBlock initial_block;
ggml_tensor* conv_1_w; // [channels, channels, 3, 3]
TAEBlock input_blocks[3];
// middle
ggml_tensor* conv_2_w; // [channels, channels, 3, 3]
TAEBlock middle_blocks[3];
// output
ggml_tensor* conv_3_w; // [channels, channels, 3, 3]
TAEBlock output_blocks[3];
// final
ggml_tensor* conv_final_w; // [z_channels, channels, 3, 3]
ggml_tensor* conv_final_b; // [z_channels]
TinyEncoder() {
for (int i = 0; i < num_blocks; i++) {
input_blocks[i].in_channels = channels;
input_blocks[i].out_channels = channels;
middle_blocks[i].in_channels = channels;
middle_blocks[i].out_channels = channels;
output_blocks[i].in_channels = channels;
output_blocks[i].out_channels = channels;
}
initial_block.in_channels = channels;
initial_block.out_channels = channels;
}
size_t calculate_mem_size() {
size_t mem_size = channels * in_channels * 3 * 3 * ggml_type_size(GGML_TYPE_F16); // conv_input_w
mem_size += channels * ggml_type_size(GGML_TYPE_F32); // conv_input_b
mem_size += initial_block.calculate_mem_size();
mem_size += channels * channels * 3 * 3 * ggml_type_size(GGML_TYPE_F16); // conv_1_w
mem_size += channels * channels * 3 * 3 * ggml_type_size(GGML_TYPE_F16); // conv_2_w
mem_size += channels * channels * 3 * 3 * ggml_type_size(GGML_TYPE_F16); // conv_3_w
for (int i = 0; i < num_blocks; i++) {
mem_size += input_blocks[i].calculate_mem_size();
mem_size += middle_blocks[i].calculate_mem_size();
mem_size += output_blocks[i].calculate_mem_size();
}
mem_size += z_channels * channels * 3 * 3 * ggml_type_size(GGML_TYPE_F16); // conv_final_w
mem_size += z_channels * ggml_type_size(GGML_TYPE_F32); // conv_final_b
return mem_size;
}
int get_num_tensors() {
int num_tensors = 7;
for (int i = 0; i < num_blocks; i++) {
num_tensors += input_blocks[i].get_num_tensors();
num_tensors += middle_blocks[i].get_num_tensors();
num_tensors += output_blocks[i].get_num_tensors();
}
num_tensors += initial_block.get_num_tensors();
return num_tensors;
}
void init_params(ggml_context* ctx) {
conv_input_w = ggml_new_tensor_4d(ctx, GGML_TYPE_F16, 3, 3, in_channels, channels);
conv_input_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, channels);
initial_block.init_params(ctx);
conv_1_w = ggml_new_tensor_4d(ctx, GGML_TYPE_F16, 3, 3, channels, channels);
conv_2_w = ggml_new_tensor_4d(ctx, GGML_TYPE_F16, 3, 3, channels, channels);
conv_3_w = ggml_new_tensor_4d(ctx, GGML_TYPE_F16, 3, 3, channels, channels);
conv_final_w = ggml_new_tensor_4d(ctx, GGML_TYPE_F16, 3, 3, channels, z_channels);
conv_final_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, z_channels);
for (int i = 0; i < num_blocks; i++) {
input_blocks[i].init_params(ctx);
middle_blocks[i].init_params(ctx);
output_blocks[i].init_params(ctx);
}
}
void map_by_name(std::map<std::string, ggml_tensor*>& tensors, std::string prefix) {
tensors[prefix + "0.weight"] = conv_input_w;
tensors[prefix + "0.bias"] = conv_input_b;
initial_block.map_by_name(tensors, prefix + "1.");
tensors[prefix + "2.weight"] = conv_1_w;
for (int i = 0; i < num_blocks; i++) {
input_blocks[i].map_by_name(tensors, prefix + std::to_string(i + 3) + ".");
}
tensors[prefix + "6.weight"] = conv_2_w;
for (int i = 0; i < num_blocks; i++) {
middle_blocks[i].map_by_name(tensors, prefix + std::to_string(i + 7) + ".");
}
tensors[prefix + "10.weight"] = conv_3_w;
for (int i = 0; i < num_blocks; i++) {
output_blocks[i].map_by_name(tensors, prefix + std::to_string(i + 11) + ".");
}
tensors[prefix + "14.weight"] = conv_final_w;
tensors[prefix + "14.bias"] = conv_final_b;
}
ggml_tensor* forward(ggml_context* ctx, ggml_tensor* x) {
// conv(3, 64)
auto z = ggml_nn_conv_2d(ctx, x, conv_input_w, conv_input_b, 1, 1, 1, 1);
// Block(64, 64)
z = initial_block.forward(ctx, z);
// conv(64, 64, stride=2, bias=False)
z = ggml_nn_conv_2d(ctx, z, conv_1_w, NULL, 2, 2, 1, 1);
// Block(64, 64), Block(64, 64), Block(64, 64)
for (int i = 0; i < num_blocks; i++) {
z = input_blocks[i].forward(ctx, z);
}
// conv(64, 64, stride=2, bias=False)
z = ggml_nn_conv_2d(ctx, z, conv_2_w, NULL, 2, 2, 1, 1);
// Block(64, 64), Block(64, 64), Block(64, 64)
for (int i = 0; i < num_blocks; i++) {
z = middle_blocks[i].forward(ctx, z);
}
// conv(64, 64, stride=2, bias=False)
z = ggml_nn_conv_2d(ctx, z, conv_3_w, NULL, 2, 2, 1, 1);
// Block(64, 64), Block(64, 64), Block(64, 64)
for (int i = 0; i < num_blocks; i++) {
z = output_blocks[i].forward(ctx, z);
}
// conv(64, 4)
z = ggml_nn_conv_2d(ctx, z, conv_final_w, conv_final_b, 1, 1, 1, 1);
return z;
}
};
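// TinyDecoder: mirrors TinyEncoder with three nn.Upsample(scale_factor=2) stages, mapping
// latents [N, 4, h, w] -> images [N, 3, 8*h, 8*w]. The input is first passed through
// tanh(x / 3) * 3 to softly clamp latent values (see forward()).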
struct TinyDecoder {
int z_channels = 4;
int channels = 64;
int output_channels = 3;
int num_blocks = 3;
// input
ggml_tensor* conv_input_w; // [channels, z_channels, 3, 3]
ggml_tensor* conv_input_b; // [channels]
TAEBlock input_blocks[3];
ggml_tensor* conv_1_w; // [channels, channels, 3, 3]
// middle
TAEBlock middle_blocks[3];
ggml_tensor* conv_2_w; // [channels, channels, 3, 3]
// output
TAEBlock output_blocks[3];
ggml_tensor* conv_3_w; // [channels, channels, 3, 3]
// final
TAEBlock final_block;
ggml_tensor* conv_final_w; // [output_channels, channels, 3, 3]
ggml_tensor* conv_final_b; // [output_channels]
ggml_tensor* in_scale_1d3; // [1]
ggml_tensor* in_scale_3; // [1]
TinyDecoder() {
for (int i = 0; i < num_blocks; i++) {
input_blocks[i].in_channels = channels;
input_blocks[i].out_channels = channels;
middle_blocks[i].in_channels = channels;
middle_blocks[i].out_channels = channels;
output_blocks[i].in_channels = channels;
output_blocks[i].out_channels = channels;
}
final_block.in_channels = channels;
final_block.out_channels = channels;
}
size_t calculate_mem_size() {
size_t mem_size = channels * z_channels * 3 * 3 * ggml_type_size(GGML_TYPE_F16); // conv_input_w
mem_size += channels * ggml_type_size(GGML_TYPE_F32); // conv_input_b
for (int i = 0; i < num_blocks; i++) {
mem_size += input_blocks[i].calculate_mem_size();
}
mem_size += channels * channels * 3 * 3 * ggml_type_size(GGML_TYPE_F16); // conv_1_w
for (int i = 0; i < num_blocks; i++) {
mem_size += middle_blocks[i].calculate_mem_size();
}
mem_size += channels * channels * 3 * 3 * ggml_type_size(GGML_TYPE_F16); // conv_2_w
for (int i = 0; i < num_blocks; i++) {
mem_size += output_blocks[i].calculate_mem_size();
}
mem_size += channels * channels * 3 * 3 * ggml_type_size(GGML_TYPE_F16); // conv_3_w
mem_size += final_block.calculate_mem_size();
mem_size += output_channels * channels * 3 * 3 * ggml_type_size(GGML_TYPE_F16); // conv_final_w
mem_size += output_channels * ggml_type_size(GGML_TYPE_F32); // conv_final_b
return mem_size;
}
int get_num_tensors() {
int num_tensors = 9;
for (int i = 0; i < num_blocks; i++) {
num_tensors += input_blocks[i].get_num_tensors();
num_tensors += middle_blocks[i].get_num_tensors();
num_tensors += output_blocks[i].get_num_tensors();
}
num_tensors += final_block.get_num_tensors();
return num_tensors;
}
void init_params(ggml_allocr* alloc, ggml_context* ctx) {
conv_input_w = ggml_new_tensor_4d(ctx, GGML_TYPE_F16, 3, 3, z_channels, channels);
conv_input_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, channels);
conv_1_w = ggml_new_tensor_4d(ctx, GGML_TYPE_F16, 3, 3, channels, channels);
conv_2_w = ggml_new_tensor_4d(ctx, GGML_TYPE_F16, 3, 3, channels, channels);
conv_3_w = ggml_new_tensor_4d(ctx, GGML_TYPE_F16, 3, 3, channels, channels);
conv_final_w = ggml_new_tensor_4d(ctx, GGML_TYPE_F16, 3, 3, channels, output_channels);
conv_final_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, output_channels);
for (int i = 0; i < num_blocks; i++) {
input_blocks[i].init_params(ctx);
middle_blocks[i].init_params(ctx);
output_blocks[i].init_params(ctx);
}
final_block.init_params(ctx);
// initialize constants scales
in_scale_1d3 = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 1);
in_scale_3 = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 1);
ggml_allocr_alloc(alloc, in_scale_1d3);
float scale_1d3 = 1.0f / 3.0f;
ggml_backend_tensor_set(in_scale_1d3, &scale_1d3, 0, sizeof(scale_1d3));
ggml_allocr_alloc(alloc, in_scale_3);
float scale_3 = 3.0f;
ggml_backend_tensor_set(in_scale_3, &scale_3, 0, sizeof(scale_3));
}
void map_by_name(std::map<std::string, ggml_tensor*>& tensors, std::string prefix) {
tensors[prefix + "0.weight"] = conv_input_w;
tensors[prefix + "0.bias"] = conv_input_b;
for (int i = 0; i < num_blocks; i++) {
input_blocks[i].map_by_name(tensors, prefix + std::to_string(i + 2) + ".");
}
tensors[prefix + "6.weight"] = conv_1_w;
for (int i = 0; i < num_blocks; i++) {
middle_blocks[i].map_by_name(tensors, prefix + std::to_string(i + 7) + ".");
}
tensors[prefix + "11.weight"] = conv_2_w;
for (int i = 0; i < num_blocks; i++) {
output_blocks[i].map_by_name(tensors, prefix + std::to_string(i + 12) + ".");
}
tensors[prefix + "16.weight"] = conv_3_w;
final_block.map_by_name(tensors, prefix + "17.");
tensors[prefix + "18.weight"] = conv_final_w;
tensors[prefix + "18.bias"] = conv_final_b;
}
ggml_tensor* forward(ggml_context* ctx, ggml_tensor* z) {
// torch.tanh(x / 3) * 3
auto h = ggml_scale(ctx, z, in_scale_1d3);
h = ggml_tanh_inplace(ctx, h);
h = ggml_scale(ctx, h, in_scale_3);
// conv(4, 64)
h = ggml_nn_conv_2d(ctx, h, conv_input_w, conv_input_b, 1, 1, 1, 1);
// nn.ReLU()
h = ggml_relu_inplace(ctx, h);
// Block(64, 64), Block(64, 64), Block(64, 64)
for (int i = 0; i < num_blocks; i++) {
h = input_blocks[i].forward(ctx, h);
}
// nn.Upsample(scale_factor=2)
h = ggml_upscale(ctx, h, 2);
// conv(64, 64, bias=False)
h = ggml_nn_conv_2d(ctx, h, conv_1_w, NULL, 1, 1, 1, 1);
// Block(64, 64), Block(64, 64), Block(64, 64)
for (int i = 0; i < num_blocks; i++) {
h = middle_blocks[i].forward(ctx, h);
}
// nn.Upsample(scale_factor=2)
h = ggml_upscale(ctx, h, 2);
// conv(64, 64, bias=False)
h = ggml_nn_conv_2d(ctx, h, conv_2_w, NULL, 1, 1, 1, 1);
// Block(64, 64), Block(64, 64), Block(64, 64)
for (int i = 0; i < num_blocks; i++) {
h = output_blocks[i].forward(ctx, h);
}
// nn.Upsample(scale_factor=2)
h = ggml_upscale(ctx, h, 2);
// conv(64, 64, bias=False)
h = ggml_nn_conv_2d(ctx, h, conv_3_w, NULL, 1, 1, 1, 1);
// Block(64, 64)
h = final_block.forward(ctx, h);
// conv(64, 3)
h = ggml_nn_conv_2d(ctx, h, conv_final_w, conv_final_b, 1, 1, 1, 1);
return h;
}
};
struct TinyAutoEncoder {
TinyEncoder encoder;
TinyDecoder decoder;
ggml_context* ctx = NULL;
bool decode_only = false;
ggml_backend_buffer_t params_buffer = NULL;
ggml_backend_buffer_t compute_buffer = NULL; // for compute
struct ggml_allocr* compute_alloc = NULL;
int memory_buffer_size = 0;
ggml_type wtype;
ggml_backend_t backend = NULL;
TinyAutoEncoder(bool decoder_only_ = true)
: decode_only(decoder_only_) {
decoder = TinyDecoder();
if (!decoder_only_) {
encoder = TinyEncoder();
}
}
size_t calculate_mem_size() {
size_t mem_size = decoder.calculate_mem_size();
if (!decode_only) {
mem_size += encoder.calculate_mem_size();
}
mem_size += 1024; // padding
return mem_size;
}
bool init(ggml_backend_t backend_) {
backend = backend_;
memory_buffer_size = calculate_mem_size();
int num_tensors = decoder.get_num_tensors();
if (!decode_only) {
num_tensors += encoder.get_num_tensors();
}
LOG_DEBUG("TAE params backend buffer size = % 6.2f MB (%i tensors)", memory_buffer_size / (1024.0 * 1024.0), num_tensors);
struct ggml_init_params params;
params.mem_size = static_cast<size_t>(num_tensors * ggml_tensor_overhead());
params.mem_buffer = NULL;
params.no_alloc = true;
// LOG_DEBUG("mem_size %u ", params.mem_size);
params_buffer = ggml_backend_alloc_buffer(backend, memory_buffer_size);
ctx = ggml_init(params);
if (!ctx) {
LOG_ERROR("ggml_init() failed");
return false;
}
return true;
}
void alloc_params() {
ggml_allocr* alloc = ggml_allocr_new_from_buffer(params_buffer);
decoder.init_params(alloc, ctx);
if (!decode_only) {
encoder.init_params(ctx);
}
// alloc all tensors linked to this context
for (struct ggml_tensor* t = ggml_get_first_tensor(ctx); t != NULL; t = ggml_get_next_tensor(ctx, t)) {
if (t->data == NULL) {
ggml_allocr_alloc(alloc, t);
}
}
ggml_allocr_free(alloc);
}
void map_by_name(std::map<std::string, ggml_tensor*>& tensors) {
decoder.map_by_name(tensors, "decoder.layers.");
if (!decode_only) {
encoder.map_by_name(tensors, "encoder.layers.");
}
}
bool load_from_file(const std::string& file_path, ggml_backend_t backend) {
LOG_INFO("loading taesd from '%s'", file_path.c_str());
if (!init(backend)) {
return false;
}
std::map<std::string, ggml_tensor*> taesd_tensors;
ModelLoader model_loader;
if (!model_loader.init_from_file(file_path)) {
LOG_ERROR("init taesd model loader from file failed: '%s'", file_path.c_str());
return false;
}
// prepare memory for the weights
{
alloc_params();
map_by_name(taesd_tensors);
}
std::set<std::string> tensor_names_in_file;
auto on_new_tensor_cb = [&](const TensorStorage& tensor_storage, ggml_tensor** dst_tensor) -> bool {
const std::string& name = tensor_storage.name;
tensor_names_in_file.insert(name);
struct ggml_tensor* real;
if (taesd_tensors.find(name) != taesd_tensors.end()) {
real = taesd_tensors[name];
} else {
if (name.find("encoder.") != std::string::npos && decode_only) {
return true;
}
LOG_ERROR("unknown tensor '%s' in model file", name.data());
return true;
}
if (
real->ne[0] != tensor_storage.ne[0] ||
real->ne[1] != tensor_storage.ne[1] ||
real->ne[2] != tensor_storage.ne[2] ||
real->ne[3] != tensor_storage.ne[3]) {
LOG_ERROR(
"tensor '%s' has wrong shape in model file: "
"got [%d, %d, %d, %d], expected [%d, %d, %d, %d]",
name.c_str(),
(int)tensor_storage.ne[0], (int)tensor_storage.ne[1], (int)tensor_storage.ne[2], (int)tensor_storage.ne[3],
(int)real->ne[0], (int)real->ne[1], (int)real->ne[2], (int)real->ne[3]);
return false;
}
*dst_tensor = real;
return true;
};
bool success = model_loader.load_tensors(on_new_tensor_cb, backend);
bool some_tensor_not_init = false;
for (auto pair : taesd_tensors) {
if (tensor_names_in_file.find(pair.first) == tensor_names_in_file.end()) {
LOG_ERROR("tensor '%s' not in model file", pair.first.c_str());
some_tensor_not_init = true;
}
}
if (some_tensor_not_init) {
return false;
}
LOG_INFO("taesd model loaded");
return success;
}
struct ggml_cgraph* build_graph(struct ggml_tensor* z, bool decode_graph) {
// since we are using ggml-alloc, this buffer only needs enough space to hold the ggml_tensor and ggml_cgraph structs, but not the tensor data
static size_t buf_size = ggml_tensor_overhead() * GGML_DEFAULT_GRAPH_SIZE + ggml_graph_overhead();
static std::vector<uint8_t> buf(buf_size);
struct ggml_init_params params = {
/*.mem_size =*/buf_size,
/*.mem_buffer =*/buf.data(),
/*.no_alloc =*/true, // the tensors will be allocated later by ggml_allocr_alloc_graph()
};
// LOG_DEBUG("mem_size %u ", params.mem_size);
struct ggml_context* ctx0 = ggml_init(params);
struct ggml_cgraph* gf = ggml_new_graph(ctx0);
struct ggml_tensor* z_ = NULL;
// if the backend is not the CPU backend, the input has to be copied into device memory
if (!ggml_backend_is_cpu(backend)) {
// pass input tensors to gpu memory
z_ = ggml_dup_tensor(ctx0, z);
ggml_allocr_alloc(compute_alloc, z_);
// pass data to device backend
if (!ggml_allocr_is_measure(compute_alloc)) {
ggml_backend_tensor_set(z_, z->data, 0, ggml_nbytes(z));
}
} else {
z_ = z;
}
struct ggml_tensor* out = decode_graph ? decoder.forward(ctx0, z_) : encoder.forward(ctx0, z_);
ggml_build_forward_expand(gf, out);
ggml_free(ctx0);
return gf;
}
void begin(struct ggml_tensor* x, bool decode) {
// measure the amount of memory required (with the alignment required by the backend)
compute_alloc = ggml_allocr_new_measure_from_backend(backend);
struct ggml_cgraph* gf = build_graph(x, decode);
// compute the required memory
size_t compute_memory_buffer_size = ggml_allocr_alloc_graph(compute_alloc, gf);
// recreate the allocator with the required memory
ggml_allocr_free(compute_alloc);
LOG_DEBUG("TAE compute buffer size: %.2f MB", compute_memory_buffer_size / 1024.0 / 1024.0);
compute_buffer = ggml_backend_alloc_buffer(backend, compute_memory_buffer_size);
compute_alloc = ggml_allocr_new_from_buffer(compute_buffer);
}
void compute(struct ggml_tensor* work_result, const int n_threads, struct ggml_tensor* z, bool decode_graph) {
ggml_allocr_reset(compute_alloc);
struct ggml_cgraph* gf = build_graph(z, decode_graph);
ggml_allocr_alloc_graph(compute_alloc, gf);
if (ggml_backend_is_cpu(backend)) {
ggml_backend_cpu_set_n_threads(backend, n_threads);
}
#ifdef SD_USE_METAL
if (ggml_backend_is_metal(backend)) {
ggml_backend_metal_set_n_cb(backend, n_threads);
}
#endif
ggml_backend_graph_compute(backend, gf);
#ifdef GGML_PERF
ggml_graph_print(gf);
#endif
ggml_backend_tensor_get_and_sync(backend, gf->nodes[gf->n_nodes - 1], work_result->data, 0, ggml_nbytes(work_result));
}
void end() {
ggml_allocr_free(compute_alloc);
ggml_backend_buffer_free(compute_buffer);
compute_alloc = NULL;
}
};
/*
=================================== ESRGAN ===================================
References:
https://github.com/xinntao/Real-ESRGAN/blob/master/inference_realesrgan.py
https://github.com/XPixelGroup/BasicSR/blob/v1.4.2/basicsr/archs/rrdbnet_arch.py
*/
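// This is the RRDBNet architecture used by Real-ESRGAN. With the defaults below (6 blocks,
// 64 features, 32 growth channels) it matches RealESRGAN_x4plus_anime_6B and upscales by 4x
// via two nearest-neighbor x2 upsamples. The tile_size member suggests that large images are
// processed in tiles elsewhere in this file to limit VRAM use.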
struct ResidualDenseBlock {
int num_features;
int num_grow_ch;
ggml_tensor* conv1_w; // [num_grow_ch, num_features, 3, 3]
ggml_tensor* conv1_b; // [num_grow_ch]
ggml_tensor* conv2_w; // [num_grow_ch, num_features + num_grow_ch, 3, 3]
ggml_tensor* conv2_b; // [num_grow_ch]
ggml_tensor* conv3_w; // [num_grow_ch, num_features + 2 * num_grow_ch, 3, 3]
ggml_tensor* conv3_b; // [num_grow_ch]
ggml_tensor* conv4_w; // [num_grow_ch, num_features + 3 * num_grow_ch, 3, 3]
ggml_tensor* conv4_b; // [num_grow_ch]
ggml_tensor* conv5_w; // [num_features, num_features + 4 * num_grow_ch, 3, 3]
ggml_tensor* conv5_b; // [num_features]
ResidualDenseBlock() {}
ResidualDenseBlock(int num_feat, int n_grow_ch) {
num_features = num_feat;
num_grow_ch = n_grow_ch;
}
size_t calculate_mem_size() {
size_t mem_size = num_features * num_grow_ch * 3 * 3 * ggml_type_size(GGML_TYPE_F16); // conv1_w
mem_size += num_grow_ch * ggml_type_size(GGML_TYPE_F32); // conv1_b
mem_size += (num_features + num_grow_ch) * num_grow_ch * 3 * 3 * ggml_type_size(GGML_TYPE_F16); // conv2_w
mem_size += num_grow_ch * ggml_type_size(GGML_TYPE_F32); // conv2_b
mem_size += (num_features + 2 * num_grow_ch) * num_grow_ch * 3 * 3 * ggml_type_size(GGML_TYPE_F16); // conv3_w
mem_size += num_grow_ch * ggml_type_size(GGML_TYPE_F32); // conv3_b
mem_size += (num_features + 3 * num_grow_ch) * num_grow_ch * 3 * 3 * ggml_type_size(GGML_TYPE_F16); // conv4_w
mem_size += num_grow_ch * ggml_type_size(GGML_TYPE_F32); // conv4_b
mem_size += (num_features + 4 * num_grow_ch) * num_features * 3 * 3 * ggml_type_size(GGML_TYPE_F16); // conv5_w
mem_size += num_features * ggml_type_size(GGML_TYPE_F32); // conv5_b
return mem_size;
}
int get_num_tensors() {
int num_tensors = 10;
return num_tensors;
}
void init_params(ggml_context* ctx) {
conv1_w = ggml_new_tensor_4d(ctx, GGML_TYPE_F16, 3, 3, num_features, num_grow_ch);
conv1_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, num_grow_ch);
conv2_w = ggml_new_tensor_4d(ctx, GGML_TYPE_F16, 3, 3, num_features + num_grow_ch, num_grow_ch);
conv2_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, num_grow_ch);
conv3_w = ggml_new_tensor_4d(ctx, GGML_TYPE_F16, 3, 3, num_features + 2 * num_grow_ch, num_grow_ch);
conv3_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, num_grow_ch);
conv4_w = ggml_new_tensor_4d(ctx, GGML_TYPE_F16, 3, 3, num_features + 3 * num_grow_ch, num_grow_ch);
conv4_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, num_grow_ch);
conv5_w = ggml_new_tensor_4d(ctx, GGML_TYPE_F16, 3, 3, num_features + 4 * num_grow_ch, num_features);
conv5_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, num_features);
}
void map_by_name(std::map<std::string, ggml_tensor*>& tensors, std::string prefix) {
tensors[prefix + "conv1.weight"] = conv1_w;
tensors[prefix + "conv1.bias"] = conv1_b;
tensors[prefix + "conv2.weight"] = conv2_w;
tensors[prefix + "conv2.bias"] = conv2_b;
tensors[prefix + "conv3.weight"] = conv3_w;
tensors[prefix + "conv3.bias"] = conv3_b;
tensors[prefix + "conv4.weight"] = conv4_w;
tensors[prefix + "conv4.bias"] = conv4_b;
tensors[prefix + "conv5.weight"] = conv5_w;
tensors[prefix + "conv5.bias"] = conv5_b;
}
ggml_tensor* forward(ggml_context* ctx, ggml_tensor* out_scale, ggml_tensor* x /* feat */) {
// x1 = self.lrelu(self.conv1(x))
ggml_tensor* x1 = ggml_nn_conv_2d(ctx, x, conv1_w, conv1_b, 1, 1, 1, 1);
x1 = ggml_leaky_relu(ctx, x1, 0.2f, true);
// x2 = self.lrelu(self.conv2(torch.cat((x, x1), 1)))
ggml_tensor* x_cat = ggml_concat(ctx, x, x1);
ggml_tensor* x2 = ggml_nn_conv_2d(ctx, x_cat, conv2_w, conv2_b, 1, 1, 1, 1);
x2 = ggml_leaky_relu(ctx, x2, 0.2f, true);
// x3 = self.lrelu(self.conv3(torch.cat((x, x1, x2), 1)))
x_cat = ggml_concat(ctx, x_cat, x2);
ggml_tensor* x3 = ggml_nn_conv_2d(ctx, x_cat, conv3_w, conv3_b, 1, 1, 1, 1);
x3 = ggml_leaky_relu(ctx, x3, 0.2f, true);
// x4 = self.lrelu(self.conv4(torch.cat((x, x1, x2, x3), 1)))
x_cat = ggml_concat(ctx, x_cat, x3);
ggml_tensor* x4 = ggml_nn_conv_2d(ctx, x_cat, conv4_w, conv4_b, 1, 1, 1, 1);
x4 = ggml_leaky_relu(ctx, x4, 0.2f, true);
// self.conv5(torch.cat((x, x1, x2, x3, x4), 1))
x_cat = ggml_concat(ctx, x_cat, x4);
ggml_tensor* x5 = ggml_nn_conv_2d(ctx, x_cat, conv5_w, conv5_b, 1, 1, 1, 1);
// return x5 * 0.2 + x
x5 = ggml_add(ctx, ggml_scale(ctx, x5, out_scale), x);
return x5;
}
};
struct EsrganBlock {
ResidualDenseBlock rd_blocks[3];
int num_residual_blocks = 3;
EsrganBlock() {}
EsrganBlock(int num_feat, int num_grow_ch) {
for (int i = 0; i < num_residual_blocks; i++) {
rd_blocks[i] = ResidualDenseBlock(num_feat, num_grow_ch);
}
}
int get_num_tensors() {
int num_tensors = 0;
for (int i = 0; i < num_residual_blocks; i++) {
num_tensors += rd_blocks[i].get_num_tensors();
}
return num_tensors;
}
size_t calculate_mem_size() {
size_t mem_size = 0;
for (int i = 0; i < num_residual_blocks; i++) {
mem_size += rd_blocks[i].calculate_mem_size();
}
return mem_size;
}
void init_params(ggml_context* ctx) {
for (int i = 0; i < num_residual_blocks; i++) {
rd_blocks[i].init_params(ctx);
}
}
void map_by_name(std::map<std::string, ggml_tensor*>& tensors, std::string prefix) {
for (int i = 0; i < num_residual_blocks; i++) {
rd_blocks[i].map_by_name(tensors, prefix + "rdb" + std::to_string(i + 1) + ".");
}
}
ggml_tensor* forward(ggml_context* ctx, ggml_tensor* out_scale, ggml_tensor* x) {
ggml_tensor* out = x;
for (int i = 0; i < num_residual_blocks; i++) {
// out = self.rdb...(x)
out = rd_blocks[i].forward(ctx, out_scale, out);
}
// return out * 0.2 + x
out = ggml_add(ctx, ggml_scale(ctx, out, out_scale), x);
return out;
}
};
struct ESRGAN {
int scale = 4; // default RealESRGAN_x4plus_anime_6B
int num_blocks = 6; // default RealESRGAN_x4plus_anime_6B
int in_channels = 3;
int out_channels = 3;
int num_features = 64; // default RealESRGAN_x4plus_anime_6B
int num_grow_ch = 32; // default RealESRGAN_x4plus_anime_6B
int tile_size = 128; // avoid cuda OOM for 4gb VRAM
ggml_tensor* conv_first_w; // [num_features, in_channels, 3, 3]
ggml_tensor* conv_first_b; // [num_features]
EsrganBlock body_blocks[6];
ggml_tensor* conv_body_w; // [num_features, num_features, 3, 3]
ggml_tensor* conv_body_b; // [num_features]
// upsample
ggml_tensor* conv_up1_w; // [num_features, num_features, 3, 3]
ggml_tensor* conv_up1_b; // [num_features]
ggml_tensor* conv_up2_w; // [num_features, num_features, 3, 3]
ggml_tensor* conv_up2_b; // [num_features]
ggml_tensor* conv_hr_w; // [num_features, num_features, 3, 3]
ggml_tensor* conv_hr_b; // [num_features]
ggml_tensor* conv_last_w; // [out_channels, num_features, 3, 3]
ggml_tensor* conv_last_b; // [out_channels]
ggml_context* ctx = NULL;
bool decode_only = false;
ggml_backend_buffer_t params_buffer = NULL;
ggml_backend_buffer_t compute_buffer = NULL; // for compute
struct ggml_allocr* compute_alloc = NULL;
int memory_buffer_size = 0;
ggml_type wtype;
ggml_backend_t backend = NULL;
ESRGAN() {
for (int i = 0; i < num_blocks; i++) {
body_blocks[i] = EsrganBlock(num_features, num_grow_ch);
}
}
size_t calculate_mem_size() {
size_t mem_size = num_features * in_channels * 3 * 3 * ggml_type_size(GGML_TYPE_F16); // conv_first_w
mem_size += num_features * ggml_type_size(GGML_TYPE_F32); // conv_first_b
for (int i = 0; i < num_blocks; i++) {
mem_size += body_blocks[i].calculate_mem_size();
}
mem_size += num_features * num_features * 3 * 3 * ggml_type_size(GGML_TYPE_F16); // conv_body_w
mem_size += num_features * ggml_type_size(GGML_TYPE_F32); // conv_body_b
// upsample
mem_size += num_features * num_features * 3 * 3 * ggml_type_size(GGML_TYPE_F16); // conv_up1_w
mem_size += num_features * ggml_type_size(GGML_TYPE_F32); // conv_up1_b
mem_size += num_features * num_features * 3 * 3 * ggml_type_size(GGML_TYPE_F16); // conv_up2_w
mem_size += num_features * ggml_type_size(GGML_TYPE_F32); // conv_up2_b
mem_size += num_features * num_features * 3 * 3 * ggml_type_size(GGML_TYPE_F16); // conv_hr_w
mem_size += num_features * ggml_type_size(GGML_TYPE_F32); // conv_hr_b
mem_size += out_channels * num_features * 3 * 3 * ggml_type_size(GGML_TYPE_F16); // conv_last_w
mem_size += out_channels * ggml_type_size(GGML_TYPE_F32); // conv_last_b
return mem_size;
}
int get_num_tensors() {
int num_tensors = 12;
for (int i = 0; i < num_blocks; i++) {
num_tensors += body_blocks[i].get_num_tensors();
}
return num_tensors;
}
bool init(ggml_backend_t backend_) {
this->backend = backend_;
memory_buffer_size = calculate_mem_size();
memory_buffer_size += 1024; // overhead
int num_tensors = get_num_tensors();
LOG_DEBUG("ESRGAN params backend buffer size = % 6.2f MB (%i tensors)", memory_buffer_size / (1024.0 * 1024.0), num_tensors);
struct ggml_init_params params;
params.mem_size = static_cast<size_t>(num_tensors * ggml_tensor_overhead());
params.mem_buffer = NULL;
params.no_alloc = true;
params_buffer = ggml_backend_alloc_buffer(backend, memory_buffer_size);
ctx = ggml_init(params);
if (!ctx) {
LOG_ERROR("ggml_init() failed");
return false;
}
return true;
}
void alloc_params() {
ggml_allocr* alloc = ggml_allocr_new_from_buffer(params_buffer);
conv_first_w = ggml_new_tensor_4d(ctx, GGML_TYPE_F16, 3, 3, in_channels, num_features);
conv_first_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, num_features);
conv_body_w = ggml_new_tensor_4d(ctx, GGML_TYPE_F16, 3, 3, num_features, num_features);
conv_body_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, num_features);
conv_up1_w = ggml_new_tensor_4d(ctx, GGML_TYPE_F16, 3, 3, num_features, num_features);
conv_up1_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, num_features);
conv_up2_w = ggml_new_tensor_4d(ctx, GGML_TYPE_F16, 3, 3, num_features, num_features);
conv_up2_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, num_features);
conv_hr_w = ggml_new_tensor_4d(ctx, GGML_TYPE_F16, 3, 3, num_features, num_features);
conv_hr_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, num_features);
conv_last_w = ggml_new_tensor_4d(ctx, GGML_TYPE_F16, 3, 3, num_features, out_channels);
conv_last_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, out_channels);
for (int i = 0; i < num_blocks; i++) {
body_blocks[i].init_params(ctx);
}
// alloc all tensors linked to this context
for (struct ggml_tensor* t = ggml_get_first_tensor(ctx); t != NULL; t = ggml_get_next_tensor(ctx, t)) {
if (t->data == NULL) {
ggml_allocr_alloc(alloc, t);
}
}
ggml_allocr_free(alloc);
}
bool load_from_file(const std::string& file_path, ggml_backend_t backend) {
LOG_INFO("loading esrgan from '%s'", file_path.c_str());
if (!init(backend)) {
return false;
}
std::map<std::string, ggml_tensor*> esrgan_tensors;
ModelLoader model_loader;
if (!model_loader.init_from_file(file_path)) {
LOG_ERROR("init esrgan model loader from file failed: '%s'", file_path.c_str());
return false;
}
// prepare memory for the weights
{
alloc_params();
map_by_name(esrgan_tensors);
}
std::set<std::string> tensor_names_in_file;
auto on_new_tensor_cb = [&](const TensorStorage& tensor_storage, ggml_tensor** dst_tensor) -> bool {
const std::string& name = tensor_storage.name;
tensor_names_in_file.insert(name);
struct ggml_tensor* real;
if (esrgan_tensors.find(name) != esrgan_tensors.end()) {
real = esrgan_tensors[name];
} else {
LOG_ERROR("unknown tensor '%s' in model file", name.data());
return true;
}
if (
real->ne[0] != tensor_storage.ne[0] ||
real->ne[1] != tensor_storage.ne[1] ||
real->ne[2] != tensor_storage.ne[2] ||
real->ne[3] != tensor_storage.ne[3]) {
LOG_ERROR(
"tensor '%s' has wrong shape in model file: "
"got [%d, %d, %d, %d], expected [%d, %d, %d, %d]",
name.c_str(),
(int)tensor_storage.ne[0], (int)tensor_storage.ne[1], (int)tensor_storage.ne[2], (int)tensor_storage.ne[3],
(int)real->ne[0], (int)real->ne[1], (int)real->ne[2], (int)real->ne[3]);
return false;
}
*dst_tensor = real;
return true;
};
bool success = model_loader.load_tensors(on_new_tensor_cb, backend);
bool some_tensor_not_init = false;
for (auto pair : esrgan_tensors) {
if (tensor_names_in_file.find(pair.first) == tensor_names_in_file.end()) {
LOG_ERROR("tensor '%s' not in model file", pair.first.c_str());
some_tensor_not_init = true;
}
}
if (some_tensor_not_init) {
return false;
}
LOG_INFO("esrgan model loaded");
return success;
}
void map_by_name(std::map<std::string, ggml_tensor*>& tensors) {
tensors["conv_first.weight"] = conv_first_w;
tensors["conv_first.bias"] = conv_first_b;
for (int i = 0; i < num_blocks; i++) {
body_blocks[i].map_by_name(tensors, "body." + std::to_string(i) + ".");
}
tensors["conv_body.weight"] = conv_body_w;
tensors["conv_body.bias"] = conv_body_b;
tensors["conv_up1.weight"] = conv_up1_w;
tensors["conv_up1.bias"] = conv_up1_b;
tensors["conv_up2.weight"] = conv_up2_w;
tensors["conv_up2.bias"] = conv_up2_b;
tensors["conv_hr.weight"] = conv_hr_w;
tensors["conv_hr.bias"] = conv_hr_b;
tensors["conv_last.weight"] = conv_last_w;
tensors["conv_last.bias"] = conv_last_b;
}
ggml_tensor* forward(ggml_context* ctx0, ggml_tensor* out_scale, ggml_tensor* x /* feat */) {
// feat = self.conv_first(feat)
auto h = ggml_nn_conv_2d(ctx0, x, conv_first_w, conv_first_b, 1, 1, 1, 1);
auto body_h = h;
// self.body(feat)
for (int i = 0; i < num_blocks; i++) {
body_h = body_blocks[i].forward(ctx0, out_scale, body_h);
}
// body_feat = self.conv_body(self.body(feat))
body_h = ggml_nn_conv_2d(ctx0, body_h, conv_body_w, conv_body_b, 1, 1, 1, 1);
// feat = feat + body_feat
h = ggml_add(ctx0, h, body_h);
// upsample
// feat = self.lrelu(self.conv_up1(F.interpolate(feat, scale_factor=2, mode='nearest')))
h = ggml_upscale(ctx0, h, 2);
h = ggml_nn_conv_2d(ctx0, h, conv_up1_w, conv_up1_b, 1, 1, 1, 1);
h = ggml_leaky_relu(ctx0, h, 0.2f, true);
// feat = self.lrelu(self.conv_up2(F.interpolate(feat, scale_factor=2, mode='nearest')))
h = ggml_upscale(ctx0, h, 2);
h = ggml_nn_conv_2d(ctx0, h, conv_up2_w, conv_up2_b, 1, 1, 1, 1);
h = ggml_leaky_relu(ctx0, h, 0.2f, true);
// out = self.conv_last(self.lrelu(self.conv_hr(feat)))
h = ggml_nn_conv_2d(ctx0, h, conv_hr_w, conv_hr_b, 1, 1, 1, 1);
h = ggml_leaky_relu(ctx0, h, 0.2f, true);
h = ggml_nn_conv_2d(ctx0, h, conv_last_w, conv_last_b, 1, 1, 1, 1);
return h;
}
struct ggml_cgraph* build_graph(struct ggml_tensor* x) {
// since we are using ggml-alloc, this buffer only needs enough space to hold the ggml_tensor and ggml_cgraph structs, but not the tensor data
static size_t buf_size = ggml_tensor_overhead() * GGML_DEFAULT_GRAPH_SIZE + ggml_graph_overhead();
static std::vector<uint8_t> buf(buf_size);
struct ggml_init_params params = {
/*.mem_size =*/buf_size,
/*.mem_buffer =*/buf.data(),
/*.no_alloc =*/true, // the tensors will be allocated later by ggml_allocr_alloc_graph()
};
struct ggml_context* ctx0 = ggml_init(params);
struct ggml_cgraph* gf = ggml_new_graph(ctx0);
struct ggml_tensor* x_ = NULL;
struct ggml_tensor* os = ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, 1);
ggml_allocr_alloc(compute_alloc, os);
if (!ggml_allocr_is_measure(compute_alloc)) {
float scale = 0.2f;
ggml_backend_tensor_set(os, &scale, 0, sizeof(scale));
}
// if the backend is not the CPU backend, the input has to be copied into device memory
if (!ggml_backend_is_cpu(backend)) {
// pass input tensors to gpu memory
x_ = ggml_dup_tensor(ctx0, x);
ggml_allocr_alloc(compute_alloc, x_);
// pass data to device backend
if (!ggml_allocr_is_measure(compute_alloc)) {
ggml_backend_tensor_set(x_, x->data, 0, ggml_nbytes(x));
}
} else {
x_ = x;
}
struct ggml_tensor* out = forward(ctx0, os, x_);
ggml_build_forward_expand(gf, out);
ggml_free(ctx0);
return gf;
}
void begin(struct ggml_tensor* x) {
// measure the amount of memory required (with the alignment required by the backend)
compute_alloc = ggml_allocr_new_measure_from_backend(backend);
struct ggml_cgraph* gf = build_graph(x);
// compute the required memory
size_t compute_memory_buffer_size = ggml_allocr_alloc_graph(compute_alloc, gf);
// recreate the allocator with the required memory
ggml_allocr_free(compute_alloc);
LOG_DEBUG("ESRGAN compute buffer size: %.2f MB", compute_memory_buffer_size / 1024.0 / 1024.0);
compute_buffer = ggml_backend_alloc_buffer(backend, compute_memory_buffer_size);
compute_alloc = ggml_allocr_new_from_buffer(compute_buffer);
}
void compute(struct ggml_tensor* work_result, const int n_threads, struct ggml_tensor* x) {
ggml_allocr_reset(compute_alloc);
struct ggml_cgraph* gf = build_graph(x);
ggml_allocr_alloc_graph(compute_alloc, gf);
if (ggml_backend_is_cpu(backend)) {
ggml_backend_cpu_set_n_threads(backend, n_threads);
}
#ifdef SD_USE_METAL
if (ggml_backend_is_metal(backend)) {
ggml_backend_metal_set_n_cb(backend, n_threads);
}
#endif
ggml_backend_graph_compute(backend, gf);
#ifdef GGML_PERF
ggml_graph_print(gf);
#endif
ggml_tensor* out = gf->nodes[gf->n_nodes - 1];
ggml_backend_tensor_get_and_sync(backend, out, work_result->data, 0, ggml_nbytes(out));
}
void end() {
ggml_allocr_free(compute_alloc);
ggml_backend_buffer_free(compute_buffer);
compute_alloc = NULL;
}
};
float ggml_backend_tensor_get_f32(ggml_tensor* tensor) {
GGML_ASSERT(tensor->type == GGML_TYPE_F32 || tensor->type == GGML_TYPE_F16);
float value;
if (tensor->type == GGML_TYPE_F32) {
ggml_backend_tensor_get(tensor, &value, 0, sizeof(value));
} else { // GGML_TYPE_F16
ggml_fp16_t f16_value;
ggml_backend_tensor_get(tensor, &f16_value, 0, sizeof(f16_value));
value = ggml_fp16_to_fp32(f16_value);
}
return value;
}
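// Holds the tensors of a LoRA checkpoint and applies them to the model weights in place.
// Typical usage (sketch; "lora_path"/"model_tensors" are placeholders):
//   LoraModel lora;
//   lora.load(backend, lora_path);
//   lora.multiplier = 0.8f; // strength
//   lora.apply(model_tensors, n_threads);
//   lora.release();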
struct LoraModel {
float multiplier = 1.0f;
std::map<std::string, struct ggml_tensor*> lora_tensors;
struct ggml_context* ctx = NULL;
ggml_backend_buffer_t params_buffer_lora = NULL;
ggml_backend_t backend = NULL;
bool load(ggml_backend_t backend_, std::string file_path) {
backend = backend_;
LOG_INFO("loading LoRA from '%s'", file_path.c_str());
ModelLoader model_loader;
if (!model_loader.init_from_file(file_path)) {
LOG_ERROR("init lora model loader from file failed: '%s'", file_path.c_str());
return false;
}
struct ggml_init_params params;
params.mem_size = static_cast<size_t>(LORA_GRAPH_SIZE * ggml_tensor_overhead());
params.mem_buffer = NULL;
params.no_alloc = true;
// LOG_DEBUG("mem_size %u ", params.mem_size);
ctx = ggml_init(params);
if (!ctx) {
LOG_ERROR("ggml_init() failed");
return false;
}
ggml_type wtype = model_loader.get_sd_wtype();
LOG_DEBUG("calculating buffer size");
int64_t memory_buffer_size = model_loader.cal_mem_size(backend);
LOG_DEBUG("lora params backend buffer size = % 6.2f MB", memory_buffer_size / (1024.0 * 1024.0));
params_buffer_lora = ggml_backend_alloc_buffer(backend, memory_buffer_size);
ggml_allocr* alloc = ggml_allocr_new_from_buffer(params_buffer_lora);
auto on_new_tensor_cb = [&](const TensorStorage& tensor_storage, ggml_tensor** dst_tensor) -> bool {
const std::string& name = tensor_storage.name;
struct ggml_tensor* real = ggml_new_tensor(ctx, tensor_storage.type, tensor_storage.n_dims, tensor_storage.ne);
ggml_allocr_alloc(alloc, real);
*dst_tensor = real;
lora_tensors[name] = real;
return true;
};
model_loader.load_tensors(on_new_tensor_cb, backend);
LOG_DEBUG("finished loaded lora");
ggml_allocr_free(alloc);
return true;
}
struct ggml_cgraph* build_graph(struct ggml_allocr* compute_alloc, std::map<std::string, struct ggml_tensor*> model_tensors) {
// build a graph that applies every LoRA update; the LoRA and model tensors are expected to live in the same backend
// since we are using ggml-alloc, this buffer only needs enough space to hold the ggml_tensor and ggml_cgraph structs, but not the tensor data
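// For every model weight named "*.weight" that has a matching lora_up/lora_down pair, the
// graph below computes, in place:
//   W += multiplier * scale * (lora_up @ lora_down)
// where scale is read from the ".scale" tensor if present, otherwise alpha / rank
// (rank = last dimension of lora_down), otherwise 1.0.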
static size_t buf_size = ggml_tensor_overhead() * LORA_GRAPH_SIZE + ggml_graph_overhead();
static std::vector<uint8_t> buf(buf_size);
struct ggml_init_params params = {
/*.mem_size =*/buf_size,
/*.mem_buffer =*/buf.data(),
/*.no_alloc =*/true, // the tensors will be allocated later by ggml_allocr_alloc_graph()
};
// LOG_DEBUG("mem_size %u ", params.mem_size);
struct ggml_context* ctx0 = ggml_init(params);
struct ggml_cgraph* gf = ggml_new_graph_custom(ctx0, LORA_GRAPH_SIZE, false);
std::set<std::string> applied_lora_tensors;
for (auto it : model_tensors) {
std::string k_tensor = it.first;
struct ggml_tensor* weight = model_tensors[it.first];
size_t k_pos = k_tensor.find(".weight");
if (k_pos == std::string::npos) {
continue;
}
k_tensor = k_tensor.substr(0, k_pos);
replace_all_chars(k_tensor, '.', '_');
std::string lora_up_name = "lora." + k_tensor + ".lora_up.weight";
std::string lora_down_name = "lora." + k_tensor + ".lora_down.weight";
std::string alpha_name = "lora." + k_tensor + ".alpha";
std::string scale_name = "lora." + k_tensor + ".scale";
ggml_tensor* lora_up = NULL;
ggml_tensor* lora_down = NULL;
if (lora_tensors.find(lora_up_name) != lora_tensors.end()) {
lora_up = lora_tensors[lora_up_name];
}
if (lora_tensors.find(lora_down_name) != lora_tensors.end()) {
lora_down = lora_tensors[lora_down_name];
}
if (lora_up == NULL || lora_down == NULL) {
continue;
}
applied_lora_tensors.insert(lora_up_name);
applied_lora_tensors.insert(lora_down_name);
applied_lora_tensors.insert(alpha_name);
applied_lora_tensors.insert(scale_name);
// calculate the lora scale
int64_t dim = lora_down->ne[lora_down->n_dims - 1];
float scale_value = 1.0f;
if (lora_tensors.find(scale_name) != lora_tensors.end()) {
scale_value = ggml_backend_tensor_get_f32(lora_tensors[scale_name]);
} else if (lora_tensors.find(alpha_name) != lora_tensors.end()) {
float alpha = ggml_backend_tensor_get_f32(lora_tensors[alpha_name]);
scale_value = alpha / dim;
}
scale_value *= multiplier;
ggml_tensor* lora_scale = ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, 1);
ggml_allocr_alloc(compute_alloc, lora_scale);
if (!ggml_allocr_is_measure(compute_alloc)) {
ggml_backend_tensor_set(lora_scale, &scale_value, 0, ggml_nbytes(lora_scale));
}
// flatten the lora tensors to 2D so they can be matrix-multiplied
int64_t lora_up_rows = lora_up->ne[lora_up->n_dims - 1];
lora_up = ggml_reshape_2d(ctx0, lora_up, ggml_nelements(lora_up) / lora_up_rows, lora_up_rows);
int64_t lora_down_rows = lora_down->ne[lora_down->n_dims - 1];
lora_down = ggml_reshape_2d(ctx0, lora_down, ggml_nelements(lora_down) / lora_down_rows, lora_down_rows);
// ggml_mul_mat requires tensor b transposed
lora_down = ggml_cont(ctx0, ggml_transpose(ctx0, lora_down));
struct ggml_tensor* updown = ggml_mul_mat(ctx0, lora_up, lora_down);
updown = ggml_cont(ctx0, ggml_transpose(ctx0, updown));
updown = ggml_reshape(ctx0, updown, weight);
GGML_ASSERT(ggml_nelements(updown) == ggml_nelements(weight));
updown = ggml_scale_inplace(ctx0, updown, lora_scale);
ggml_tensor* final_weight;
// if (weight->type != GGML_TYPE_F32 && weight->type != GGML_TYPE_F16) {
// final_weight = ggml_new_tensor(ctx0, GGML_TYPE_F32, weight->n_dims, weight->ne);
// final_weight = ggml_cpy_inplace(ctx0, weight, final_weight);
// final_weight = ggml_add_inplace(ctx0, final_weight, updown);
// final_weight = ggml_cpy_inplace(ctx0, final_weight, weight);
// } else {
// final_weight = ggml_add_inplace(ctx0, weight, updown);
// }
final_weight = ggml_add_inplace(ctx0, weight, updown); // apply directly
ggml_build_forward_expand(gf, final_weight);
}
for (auto& kv : lora_tensors) {
if (applied_lora_tensors.find(kv.first) == applied_lora_tensors.end()) {
LOG_WARN("unused lora tensor %s", kv.first.c_str());
}
}
return gf;
}
void apply(std::map<std::string, struct ggml_tensor*> model_tensors, int n_threads) {
struct ggml_allocr* compute_alloc = NULL;
ggml_backend_buffer_t buffer_compute_lora = NULL;
// compute the required memory
{
compute_alloc = ggml_allocr_new_measure_from_backend(backend);
struct ggml_cgraph* gf = build_graph(compute_alloc, model_tensors);
size_t compute_memory_buffer_size = ggml_allocr_alloc_graph(compute_alloc, gf);
// recreate the allocator with the required memory
ggml_allocr_free(compute_alloc);
LOG_DEBUG("apply lora buffer size: %.2f MB", compute_memory_buffer_size / 1024.0 / 1024.0);
buffer_compute_lora = ggml_backend_alloc_buffer(backend, compute_memory_buffer_size);
compute_alloc = ggml_allocr_new_from_buffer(buffer_compute_lora);
}
ggml_allocr_reset(compute_alloc);
struct ggml_cgraph* gf = build_graph(compute_alloc, model_tensors);
ggml_allocr_alloc_graph(compute_alloc, gf);
if (ggml_backend_is_cpu(backend)) {
ggml_backend_cpu_set_n_threads(backend, n_threads);
}
#ifdef SD_USE_METAL
if (ggml_backend_is_metal(backend)) {
ggml_backend_metal_set_n_cb(backend, n_threads);
}
#endif
ggml_backend_graph_compute(backend, gf);
ggml_allocr_free(compute_alloc);
ggml_backend_buffer_free(buffer_compute_lora);
compute_alloc = NULL;
}
void release() {
if (ctx != NULL) {
ggml_free(ctx);
ctx = NULL;
}
if (params_buffer_lora != NULL) {
ggml_backend_buffer_free(params_buffer_lora);
params_buffer_lora = NULL;
}
}
};
/*================================================= CompVisDenoiser ==================================================*/
// Ref: https://github.com/crowsonkb/k-diffusion/blob/master/k_diffusion/external.py
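// A sigma schedule maps the discrete DDPM timesteps (TIMESTEPS entries of alphas_cumprod) to
// continuous noise levels. In the k-diffusion convention
//   sigma_t = sqrt((1 - alphas_cumprod[t]) / alphas_cumprod[t]),
// and sigma_to_t() / t_to_sigma() below interpolate piecewise-linearly in log-sigma space
// between the tabulated values.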
struct SigmaSchedule {
float alphas_cumprod[TIMESTEPS];
float sigmas[TIMESTEPS];
float log_sigmas[TIMESTEPS];
virtual std::vector<float> get_sigmas(uint32_t n) = 0;
float sigma_to_t(float sigma) {
float log_sigma = std::log(sigma);
std::vector<float> dists;
dists.reserve(TIMESTEPS);
for (float log_sigma_val : log_sigmas) {
dists.push_back(log_sigma - log_sigma_val);
}
int low_idx = 0;
for (size_t i = 0; i < TIMESTEPS; i++) {
if (dists[i] >= 0) {
low_idx++;
}
}
low_idx = std::min(std::max(low_idx - 1, 0), TIMESTEPS - 2);
int high_idx = low_idx + 1;
float low = log_sigmas[low_idx];
float high = log_sigmas[high_idx];
float w = (low - log_sigma) / (low - high);
w = std::max(0.f, std::min(1.f, w));
float t = (1.0f - w) * low_idx + w * high_idx;
return t;
}
float t_to_sigma(float t) {
int low_idx = static_cast<int>(std::floor(t));
int high_idx = static_cast<int>(std::ceil(t));
float w = t - static_cast<float>(low_idx);
float log_sigma = (1.0f - w) * log_sigmas[low_idx] + w * log_sigmas[high_idx];
return std::exp(log_sigma);
}
};
struct DiscreteSchedule : SigmaSchedule {
std::vector<float> get_sigmas(uint32_t n) {
std::vector<float> result;
int t_max = TIMESTEPS - 1;
if (n == 0) {
return result;
} else if (n == 1) {
result.push_back(t_to_sigma((float)t_max));
result.push_back(0);
return result;
}
float step = static_cast<float>(t_max) / static_cast<float>(n - 1);
for (uint32_t i = 0; i < n; ++i) {
float t = t_max - step * i;
result.push_back(t_to_sigma(t));
}
result.push_back(0);
return result;
}
};
struct KarrasSchedule : SigmaSchedule {
std::vector<float> get_sigmas(uint32_t n) {
// These *COULD* be function arguments here,
// but does anybody ever bother to touch them?
float sigma_min = 0.1f;
float sigma_max = 10.f;
float rho = 7.f;
std::vector<float> result(n + 1);
float min_inv_rho = pow(sigma_min, (1.f / rho));
float max_inv_rho = pow(sigma_max, (1.f / rho));
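// Karras et al. (2022), Eq. (5):
//   sigma_i = (sigma_max^(1/rho) + (i / (n-1)) * (sigma_min^(1/rho) - sigma_max^(1/rho)))^rho
// for i = 0..n-1, with a final sigma of 0 appended.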
for (uint32_t i = 0; i < n; i++) {
// Eq. (5) from Karras et al 2022
result[i] = pow(max_inv_rho + (float)i / ((float)n - 1.f) * (min_inv_rho - max_inv_rho), rho);
}
result[n] = 0.;
return result;
}
};
struct Denoiser {
std::shared_ptr<SigmaSchedule> schedule = std::make_shared<DiscreteSchedule>();
virtual std::vector<float> get_scalings(float sigma) = 0;
};
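// CompVisDenoiser wraps an eps-prediction model: the UNet input is scaled by c_in and the
// denoised estimate is x + c_out * eps_hat (c_out = -sigma). CompVisVDenoiser wraps a
// v-prediction model: denoised = c_skip * x + c_out * v_hat. This mirrors k-diffusion's
// DiscreteEpsDDPMDenoiser / DiscreteVDDPMDenoiser.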
struct CompVisDenoiser : public Denoiser {
float sigma_data = 1.0f;
std::vector<float> get_scalings(float sigma) {
float c_out = -sigma;
float c_in = 1.0f / std::sqrt(sigma * sigma + sigma_data * sigma_data);
return {c_out, c_in};
}
};
struct CompVisVDenoiser : public Denoiser {
float sigma_data = 1.0f;
std::vector<float> get_scalings(float sigma) {
float c_skip = sigma_data * sigma_data / (sigma * sigma + sigma_data * sigma_data);
float c_out = -sigma * sigma_data / std::sqrt(sigma * sigma + sigma_data * sigma_data);
float c_in = 1.0f / std::sqrt(sigma * sigma + sigma_data * sigma_data);
return {c_skip, c_out, c_in};
}
};
/*=============================================== StableDiffusionGGML ================================================*/
class StableDiffusionGGML {
public:
SDVersion version;
bool vae_decode_only = false;
bool free_params_immediately = false;
std::shared_ptr<RNG> rng = std::make_shared<STDDefaultRNG>();
int n_threads = -1;
float scale_factor = 0.18215f;
FrozenCLIPEmbedderWithCustomWords cond_stage_model;
UNetModel diffusion_model;
AutoEncoderKL first_stage_model;
bool use_tiny_autoencoder = false;
bool vae_tiling = false;
std::map<std::string, struct ggml_tensor*> tensors;
std::string lora_model_dir;
// lora_name => multiplier
std::unordered_map<std::string, float> curr_lora_state;
std::map<std::string, LoraModel> loras;
std::shared_ptr<Denoiser> denoiser = std::make_shared<CompVisDenoiser>();
ggml_backend_t backend = NULL; // general backend
ggml_type model_data_type = GGML_TYPE_COUNT;
TinyAutoEncoder tae_first_stage;
std::string taesd_path;
ESRGAN esrgan_upscaler;
std::string esrgan_path;
bool upscale_output = false;
StableDiffusionGGML() = default;
StableDiffusionGGML(int n_threads,
bool vae_decode_only,
bool free_params_immediately,
std::string lora_model_dir,
RNGType rng_type)
: n_threads(n_threads),
vae_decode_only(vae_decode_only),
free_params_immediately(free_params_immediately),
lora_model_dir(lora_model_dir) {
first_stage_model.decode_only = vae_decode_only;
tae_first_stage.decode_only = vae_decode_only;
if (rng_type == STD_DEFAULT_RNG) {
rng = std::make_shared<STDDefaultRNG>();
} else if (rng_type == CUDA_RNG) {
rng = std::make_shared<PhiloxRNG>();
}
this->lora_model_dir = lora_model_dir;
}
~StableDiffusionGGML() {
cond_stage_model.destroy();
diffusion_model.destroy();
if (!use_tiny_autoencoder) {
first_stage_model.destroy();
}
}
bool load_from_file(const std::string& model_path,
const std::string& vae_path,
ggml_type wtype,
Schedule schedule,
int clip_skip) {
#ifdef SD_USE_CUBLAS
LOG_DEBUG("Using CUDA backend");
backend = ggml_backend_cuda_init(0);
#endif
#ifdef SD_USE_METAL
LOG_DEBUG("Using Metal backend");
ggml_metal_log_set_callback(ggml_log_callback_default, nullptr);
backend = ggml_backend_metal_init();
#endif
if (!backend) {
LOG_DEBUG("Using CPU backend");
backend = ggml_backend_cpu_init();
}
#ifdef SD_USE_FLASH_ATTENTION
#if defined(SD_USE_CUBLAS) || defined(SD_USE_METAL)
LOG_WARN("Flash Attention not supported with GPU Backend");
#else
LOG_INFO("Flash Attention enabled");
#endif
#endif
LOG_INFO("loading model from '%s'", model_path.c_str());
ModelLoader model_loader;
if (!model_loader.init_from_file(model_path)) {
LOG_ERROR("init model loader from file failed: '%s'", model_path.c_str());
return false;
}
if (vae_path.size() > 0) {
LOG_INFO("loading vae from '%s'", vae_path.c_str());
if (!model_loader.init_from_file(vae_path, "vae.")) {
LOG_WARN("loading vae from '%s' failed", vae_path.c_str());
}
}
version = model_loader.get_sd_version();
if (version == VERSION_COUNT) {
LOG_ERROR("get sd version from file failed: '%s'", model_path.c_str());
return false;
}
if (version == VERSION_XL) {
scale_factor = 0.13025f;
}
cond_stage_model = FrozenCLIPEmbedderWithCustomWords(version, clip_skip);
diffusion_model = UNetModel(version);
LOG_INFO("Stable Diffusion %s ", model_version_to_str[version]);
if (wtype == GGML_TYPE_COUNT) {
model_data_type = model_loader.get_sd_wtype();
} else {
model_data_type = wtype;
}
LOG_INFO("Stable Diffusion weight type: %s", ggml_type_name(model_data_type));
LOG_DEBUG("loading vocab");
std::string merges_utf8_str = model_loader.load_merges();
if (merges_utf8_str.size() == 0) {
LOG_ERROR("get merges failed: '%s'", model_path.c_str());
return false;
}
cond_stage_model.tokenizer.load_from_merges(merges_utf8_str);
// create the ggml context for network params
LOG_DEBUG("ggml tensor size = %d bytes", (int)sizeof(ggml_tensor));
if (
!cond_stage_model.initialize(backend, model_data_type) ||
!diffusion_model.initialize(backend, model_data_type)) {
return false;
}
ggml_type vae_type = model_data_type;
if (version == VERSION_XL) {
vae_type = GGML_TYPE_F32; // use F32 to reduce NaN issues (does not fully fix them)
}
if (!use_tiny_autoencoder && !first_stage_model.initialize(backend, vae_type)) {
return false;
}
LOG_DEBUG("preparing memory for the weights");
// prepare memory for the weights
{
// cond_stage_model(FrozenCLIPEmbedder)
cond_stage_model.alloc_params();
cond_stage_model.map_by_name(tensors, "cond_stage_model.");
// diffusion_model(UNetModel)
diffusion_model.alloc_params();
diffusion_model.map_by_name(tensors, "model.diffusion_model.");
if (!use_tiny_autoencoder) {
// first_stage_model(AutoEncoderKL)
first_stage_model.alloc_params();
first_stage_model.map_by_name(tensors, "first_stage_model.");
}
}
struct ggml_init_params params;
params.mem_size = static_cast<size_t>(10 * 1024) * 1024; // 10M
params.mem_buffer = NULL;
params.no_alloc = false;
// LOG_DEBUG("mem_size %u ", params.mem_size);
struct ggml_context* ctx = ggml_init(params); // for alphas_cumprod and is_using_v_parameterization check
if (!ctx) {
LOG_ERROR("ggml_init() failed");
return false;
}
ggml_tensor* alphas_cumprod_tensor = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, TIMESTEPS);
calculate_alphas_cumprod((float*)alphas_cumprod_tensor->data);
// load weights
LOG_DEBUG("loading weights");
std::set<std::string> tensor_names_in_file;
int64_t t0 = ggml_time_ms();
size_t total_size = 0;
std::vector<char> read_buf;
auto on_new_tensor_cb = [&](const TensorStorage& tensor_storage, ggml_tensor** dst_tensor) -> bool {
const std::string& name = tensor_storage.name;
tensor_names_in_file.insert(name);
if (name == "alphas_cumprod") {
*dst_tensor = alphas_cumprod_tensor;
return true;
}
struct ggml_tensor* real;
if (tensors.find(name) != tensors.end()) {
real = tensors[name];
} else {
if (use_tiny_autoencoder && starts_with(name, "first_stage_model.")) {
return true;
}
if (name.find("quant") == std::string::npos && name.find("first_stage_model.encoder.") == std::string::npos) {
LOG_WARN("unknown tensor '%s' in model file", name.data());
} else {
if (!vae_decode_only) {
LOG_WARN("unknown tensor '%s' in model file", name.data());
}
}
return true;
}
if (
real->ne[0] != tensor_storage.ne[0] ||
real->ne[1] != tensor_storage.ne[1] ||
real->ne[2] != tensor_storage.ne[2] ||
real->ne[3] != tensor_storage.ne[3]) {
LOG_ERROR(
"tensor '%s' has wrong shape in model file: "
"got [%d, %d, %d, %d], expected [%d, %d, %d, %d]",
name.c_str(),
(int)tensor_storage.ne[0], (int)tensor_storage.ne[1], (int)tensor_storage.ne[2], (int)tensor_storage.ne[3],
(int)real->ne[0], (int)real->ne[1], (int)real->ne[2], (int)real->ne[3]);
return false;
}
*dst_tensor = real;
total_size += ggml_nbytes(real);
return true;
};
// print_ggml_tensor(alphas_cumprod_tensor);
bool success = model_loader.load_tensors(on_new_tensor_cb, backend);
if (!success) {
LOG_ERROR("load tensors from file failed");
ggml_free(ctx);
return false;
}
// print_ggml_tensor(alphas_cumprod_tensor);
// calculate_alphas_cumprod((float*)alphas_cumprod_tensor->data);
bool some_tensor_not_init = false;
for (auto pair : tensors) {
if (pair.first.find("cond_stage_model.transformer.text_model.encoder.layers.23") != std::string::npos) {
continue;
}
if (use_tiny_autoencoder && starts_with(pair.first, "first_stage_model.")) {
continue;
}
if (tensor_names_in_file.find(pair.first) == tensor_names_in_file.end()) {
LOG_ERROR("tensor '%s' not in model file", pair.first.c_str());
some_tensor_not_init = true;
}
}
if (some_tensor_not_init) {
ggml_free(ctx);
return false;
}
LOG_DEBUG("model size = %.2fMB", total_size / 1024.0 / 1024.0);
size_t total_params_size =
cond_stage_model.memory_buffer_size +
diffusion_model.memory_buffer_size +
first_stage_model.memory_buffer_size;
LOG_INFO("total memory buffer size = %.2fMB (clip %.2fMB, unet %.2fMB, vae %.2fMB)",
total_params_size / 1024.0 / 1024.0,
cond_stage_model.memory_buffer_size / 1024.0 / 1024.0,
diffusion_model.memory_buffer_size / 1024.0 / 1024.0,
first_stage_model.memory_buffer_size / 1024.0 / 1024.0);
int64_t t1 = ggml_time_ms();
LOG_INFO("loading model from '%s' completed, taking %.2fs", model_path.c_str(), (t1 - t0) * 1.0f / 1000);
// check is_using_v_parameterization_for_sd2
bool is_using_v_parameterization = false;
if (version == VERSION_2_x) {
if (is_using_v_parameterization_for_sd2(ctx)) {
is_using_v_parameterization = true;
}
}
if (is_using_v_parameterization) {
denoiser = std::make_shared<CompVisVDenoiser>();
LOG_INFO("running in v-prediction mode");
} else {
LOG_INFO("running in eps-prediction mode");
}
if (schedule != DEFAULT) {
switch (schedule) {
case DISCRETE:
LOG_INFO("running with discrete schedule");
denoiser->schedule = std::make_shared<DiscreteSchedule>();
break;
case KARRAS:
LOG_INFO("running with Karras schedule");
denoiser->schedule = std::make_shared<KarrasSchedule>();
break;
case DEFAULT:
// Don't touch anything.
break;
default:
LOG_ERROR("Unknown schedule %i", schedule);
abort();
}
}
for (int i = 0; i < TIMESTEPS; i++) {
denoiser->schedule->alphas_cumprod[i] = ((float*)alphas_cumprod_tensor->data)[i];
denoiser->schedule->sigmas[i] = std::sqrt((1 - denoiser->schedule->alphas_cumprod[i]) / denoiser->schedule->alphas_cumprod[i]);
denoiser->schedule->log_sigmas[i] = std::log(denoiser->schedule->sigmas[i]);
}
LOG_DEBUG("finished loaded file");
ggml_free(ctx);
if (upscale_output) {
if (!esrgan_upscaler.load_from_file(esrgan_path, backend)) {
return false;
}
}
if (use_tiny_autoencoder) {
return tae_first_stage.load_from_file(taesd_path, backend);
}
return true;
}
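// heuristic for SD 2.x checkpoints: run the UNet once on a constant latent at
// t = 999; for v-prediction models the raw output moves strongly away from the
// input (mean difference < -1), while eps-prediction output stays close to it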
bool is_using_v_parameterization_for_sd2(ggml_context* work_ctx) {
struct ggml_tensor* x_t = ggml_new_tensor_4d(work_ctx, GGML_TYPE_F32, 8, 8, 4, 1);
ggml_set_f32(x_t, 0.5);
struct ggml_tensor* c = ggml_new_tensor_4d(work_ctx, GGML_TYPE_F32, 1024, 2, 1, 1);
ggml_set_f32(c, 0.5);
struct ggml_tensor* timesteps = ggml_new_tensor_1d(work_ctx, GGML_TYPE_F32, 1); // [N, ]
struct ggml_tensor* t_emb = new_timestep_embedding(work_ctx, NULL, timesteps, diffusion_model.model_channels); // [N, model_channels]
diffusion_model.begin(x_t, c, t_emb);
int64_t t0 = ggml_time_ms();
ggml_set_f32(timesteps, 999);
set_timestep_embedding(timesteps, t_emb, diffusion_model.model_channels);
struct ggml_tensor* out = ggml_dup_tensor(work_ctx, x_t);
diffusion_model.compute(out, n_threads, x_t, NULL, c, t_emb);
diffusion_model.end();
double result = 0.f;
{
float* vec_x = (float*)x_t->data;
float* vec_out = (float*)out->data;
int64_t n = ggml_nelements(out);
for (int i = 0; i < n; i++) {
result += ((double)vec_out[i] - (double)vec_x[i]);
}
result /= n;
}
int64_t t1 = ggml_time_ms();
LOG_DEBUG("check is_using_v_parameterization_for_sd2, taking %.2fs", (t1 - t0) * 1.0f / 1000);
return result < -1;
}
void apply_lora(const std::string& lora_name, float multiplier) {
int64_t t0 = ggml_time_ms();
LoraModel lora;
std::string st_file_path = path_join(lora_model_dir, lora_name + ".safetensors");
std::string ckpt_file_path = path_join(lora_model_dir, lora_name + ".ckpt");
std::string file_path;
if (file_exists(st_file_path)) {
file_path = st_file_path;
} else if (file_exists(ckpt_file_path)) {
file_path = ckpt_file_path;
} else {
LOG_WARN("can not find %s or %s for lora %s", st_file_path.c_str(), ckpt_file_path.c_str(), lora_name.c_str());
return;
}
if (!lora.load(backend, file_path)) {
LOG_WARN("load lora tensors from %s failed", file_path.c_str());
return;
}
lora.multiplier = multiplier;
lora.apply(tensors, n_threads);
loras[lora_name] = lora;
lora.release();
int64_t t1 = ggml_time_ms();
LOG_INFO("lora '%s' applied, taking %.2fs",
lora_name.c_str(),
(t1 - t0) * 1.0f / 1000);
}
void apply_loras(const std::unordered_map<std::string, float>& lora_state) {
if (lora_state.size() > 0 && model_data_type != GGML_TYPE_F16 && model_data_type != GGML_TYPE_F32) {
LOG_WARN("In quantized models when applying LoRA, the images have poor quality.");
}
std::unordered_map<std::string, float> lora_state_diff;
for (auto& kv : lora_state) {
const std::string& lora_name = kv.first;
float multiplier = kv.second;
if (curr_lora_state.find(lora_name) != curr_lora_state.end()) {
float curr_multiplier = curr_lora_state[lora_name];
float multiplier_diff = multiplier - curr_multiplier;
if (multiplier_diff != 0.f) {
lora_state_diff[lora_name] = multiplier_diff;
}
} else {
lora_state_diff[lora_name] = multiplier;
}
}
for (auto& kv : lora_state_diff) {
apply_lora(kv.first, kv.second);
}
curr_lora_state = lora_state;
}
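// tokenize the prompt, run CLIP, then re-weight each token's hidden states and
// rescale so the overall mean matches the unweighted result; for SDXL, also
// build the vector conditioning (pooled embedding followed by timestep-style
// embeddings of the original size, crop coordinates and target size)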
std::pair<ggml_tensor*, ggml_tensor*> get_learned_condition(ggml_context* work_ctx, const std::string& text, int width, int height, bool force_zero_embeddings = false) {
auto tokens_and_weights = cond_stage_model.tokenize(text, true);
std::vector<int>& tokens = tokens_and_weights.first;
std::vector<float>& weights = tokens_and_weights.second;
int64_t t0 = ggml_time_ms();
cond_stage_model.begin(work_ctx, (int)tokens.size());
auto result_pair = cond_stage_model.compute(n_threads, tokens); // [N, n_token, hidden_size]
struct ggml_tensor* hidden_states = result_pair.first;
struct ggml_tensor* pooled = result_pair.second;
// if (pooled != NULL) {
// print_ggml_tensor(hidden_states);
// print_ggml_tensor(pooled);
// }
cond_stage_model.end();
int64_t t1 = ggml_time_ms();
LOG_DEBUG("computing condition graph completed, taking %" PRId64 " ms", t1 - t0);
ggml_tensor* result = ggml_dup_tensor(work_ctx, hidden_states);
{
float original_mean = ggml_tensor_mean(hidden_states);
for (int i2 = 0; i2 < hidden_states->ne[2]; i2++) {
for (int i1 = 0; i1 < hidden_states->ne[1]; i1++) {
for (int i0 = 0; i0 < hidden_states->ne[0]; i0++) {
float value = ggml_tensor_get_f32(hidden_states, i0, i1, i2);
value *= weights[i1];
ggml_tensor_set_f32(result, value, i0, i1, i2);
}
}
}
float new_mean = ggml_tensor_mean(result);
ggml_tensor_scale(result, (original_mean / new_mean));
}
if (force_zero_embeddings) {
float* vec = (float*)result->data;
for (int i = 0; i < ggml_nelements(result); i++) {
vec[i] = 0;
}
}
ggml_tensor* vec = NULL;
if (version == VERSION_XL) {
size_t out_dim = 256;
vec = ggml_new_tensor_1d(work_ctx, GGML_TYPE_F32, diffusion_model.adm_in_channels);
// [0:1280]
size_t offset = 0;
memcpy(vec->data, pooled->data, ggml_nbytes(pooled));
offset += ggml_nbytes(pooled);
struct ggml_tensor* timesteps = ggml_new_tensor_1d(work_ctx, GGML_TYPE_F32, 2);
// original_size_as_tuple
float orig_width = (float)width;
float orig_height = (float)height;
ggml_tensor_set_f32(timesteps, orig_height, 0);
ggml_tensor_set_f32(timesteps, orig_width, 1);
ggml_tensor* embed_view = ggml_view_2d(work_ctx, vec, out_dim, 2, ggml_type_size(GGML_TYPE_F32) * out_dim, offset);
offset += ggml_nbytes(embed_view);
set_timestep_embedding(timesteps, embed_view, out_dim);
// print_ggml_tensor(ggml_reshape_1d(work_ctx, embed_view, out_dim * 2));
// crop_coords_top_left
float crop_coord_top = 0.f;
float crop_coord_left = 0.f;
ggml_tensor_set_f32(timesteps, crop_coord_top, 0);
ggml_tensor_set_f32(timesteps, crop_coord_left, 1);
embed_view = ggml_view_2d(work_ctx, vec, out_dim, 2, ggml_type_size(GGML_TYPE_F32) * out_dim, offset);
offset += ggml_nbytes(embed_view);
set_timestep_embedding(timesteps, embed_view, out_dim);
// print_ggml_tensor(ggml_reshape_1d(work_ctx, embed_view, out_dim * 2));
// target_size_as_tuple
float target_width = (float)width;
float target_height = (float)height;
ggml_tensor_set_f32(timesteps, target_height, 0);
ggml_tensor_set_f32(timesteps, target_width, 1);
embed_view = ggml_view_2d(work_ctx, vec, out_dim, 2, ggml_type_size(GGML_TYPE_F32) * out_dim, offset);
offset += ggml_nbytes(embed_view);
set_timestep_embedding(timesteps, embed_view, out_dim);
// print_ggml_tensor(ggml_reshape_1d(work_ctx, embed_view, out_dim * 2));
GGML_ASSERT(offset == ggml_nbytes(vec));
}
// print_ggml_tensor(result);
return {result, vec};
}
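// sampler entry point: wraps the UNet in a denoise() closure that applies the
// CompVis(V) scalings and classifier-free guidance
// (out_uncond + cfg_scale * (out_cond - out_uncond)), then integrates x from
// sigmas[0] down to 0 with the selected method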
ggml_tensor* sample(ggml_context* work_ctx,
ggml_tensor* x_t,
ggml_tensor* noise,
ggml_tensor* c,
ggml_tensor* c_vector,
ggml_tensor* uc,
ggml_tensor* uc_vector,
float cfg_scale,
SampleMethod method,
const std::vector<float>& sigmas) {
size_t steps = sigmas.size() - 1;
// x_t = load_tensor_from_file(work_ctx, "./rand0.bin");
// print_ggml_tensor(x_t);
struct ggml_tensor* x = ggml_dup_tensor(work_ctx, x_t);
copy_ggml_tensor(x, x_t);
struct ggml_tensor* noised_input = ggml_dup_tensor(work_ctx, x_t);
struct ggml_tensor* timesteps = ggml_new_tensor_1d(work_ctx, GGML_TYPE_F32, 1); // [N, ]
struct ggml_tensor* t_emb = new_timestep_embedding(work_ctx, NULL, timesteps, diffusion_model.model_channels); // [N, model_channels]
diffusion_model.begin(noised_input, c, t_emb, c_vector);
bool has_unconditioned = cfg_scale != 1.0 && uc != NULL;
if (noise == NULL) {
// x = x * sigmas[0]
ggml_tensor_scale(x, sigmas[0]);
} else {
// xi = x + noise * sigma_sched[0]
ggml_tensor_scale(noise, sigmas[0]);
ggml_tensor_add(x, noise);
}
// denoise wrapper
struct ggml_tensor* out_cond = ggml_dup_tensor(work_ctx, x);
struct ggml_tensor* out_uncond = NULL;
if (has_unconditioned) {
out_uncond = ggml_dup_tensor(work_ctx, x);
}
struct ggml_tensor* denoised = ggml_dup_tensor(work_ctx, x);
auto denoise = [&](ggml_tensor* input, float sigma, int step) {
if (step == 1) {
pretty_progress(0, (int)steps, 0);
}
int64_t t0 = ggml_time_us();
float c_skip = 1.0f;
float c_out = 1.0f;
float c_in = 1.0f;
std::vector<float> scaling = denoiser->get_scalings(sigma);
if (scaling.size() == 3) { // CompVisVDenoiser
c_skip = scaling[0];
c_out = scaling[1];
c_in = scaling[2];
} else { // CompVisDenoiser
c_out = scaling[0];
c_in = scaling[1];
}
float t = denoiser->schedule->sigma_to_t(sigma);
ggml_set_f32(timesteps, t);
set_timestep_embedding(timesteps, t_emb, diffusion_model.model_channels);
copy_ggml_tensor(noised_input, input);
// noised_input = noised_input * c_in
ggml_tensor_scale(noised_input, c_in);
// cond
diffusion_model.compute(out_cond, n_threads, noised_input, NULL, c, t_emb, c_vector);
float* negative_data = NULL;
if (has_unconditioned) {
// uncond
diffusion_model.compute(out_uncond, n_threads, noised_input, NULL, uc, t_emb, uc_vector);
negative_data = (float*)out_uncond->data;
}
float* vec_denoised = (float*)denoised->data;
float* vec_input = (float*)input->data;
float* positive_data = (float*)out_cond->data;
int ne_elements = (int)ggml_nelements(denoised);
for (int i = 0; i < ne_elements; i++) {
float latent_result = positive_data[i];
if (has_unconditioned) {
// out_uncond + cfg_scale * (out_cond - out_uncond)
latent_result = negative_data[i] + cfg_scale * (positive_data[i] - negative_data[i]);
}
// v = latent_result, eps = latent_result
// denoised = (v * c_out + input * c_skip) or (input + eps * c_out)
vec_denoised[i] = latent_result * c_out + vec_input[i] * c_skip;
}
int64_t t1 = ggml_time_us();
if (step > 0) {
pretty_progress(step, (int)steps, (t1 - t0) / 1000000.f);
// LOG_INFO("step %d sampling completed taking %.2fs", step, (t1 - t0) * 1.0f / 1000000);
}
};
// run the selected sampling method
switch (method) {
case EULER_A: {
struct ggml_tensor* noise = ggml_dup_tensor(work_ctx, x);
struct ggml_tensor* d = ggml_dup_tensor(work_ctx, x);
for (int i = 0; i < steps; i++) {
float sigma = sigmas[i];
// denoise
denoise(x, sigma, i + 1);
// d = (x - denoised) / sigma
{
float* vec_d = (float*)d->data;
float* vec_x = (float*)x->data;
float* vec_denoised = (float*)denoised->data;
for (int j = 0; j < ggml_nelements(d); j++) {
vec_d[j] = (vec_x[j] - vec_denoised[j]) / sigma;
}
}
// get_ancestral_step
float sigma_up = std::min(sigmas[i + 1],
std::sqrt(sigmas[i + 1] * sigmas[i + 1] * (sigmas[i] * sigmas[i] - sigmas[i + 1] * sigmas[i + 1]) / (sigmas[i] * sigmas[i])));
float sigma_down = std::sqrt(sigmas[i + 1] * sigmas[i + 1] - sigma_up * sigma_up);
// Euler method
float dt = sigma_down - sigmas[i];
// x = x + d * dt
{
float* vec_d = (float*)d->data;
float* vec_x = (float*)x->data;
for (int j = 0; j < ggml_nelements(x); j++) {
vec_x[j] = vec_x[j] + vec_d[j] * dt;
}
}
if (sigmas[i + 1] > 0) {
// x = x + noise_sampler(sigmas[i], sigmas[i + 1]) * s_noise * sigma_up
ggml_tensor_set_f32_randn(noise, rng);
// noise = load_tensor_from_file(work_ctx, "./rand" + std::to_string(i+1) + ".bin");
{
float* vec_x = (float*)x->data;
float* vec_noise = (float*)noise->data;
for (int j = 0; j < ggml_nelements(x); j++) {
vec_x[j] = vec_x[j] + vec_noise[j] * sigma_up;
}
}
}
}
} break;
case EULER: // Implemented without any sigma churn
{
struct ggml_tensor* d = ggml_dup_tensor(work_ctx, x);
for (int i = 0; i < steps; i++) {
float sigma = sigmas[i];
// denoise
denoise(x, sigma, i + 1);
// d = (x - denoised) / sigma
{
float* vec_d = (float*)d->data;
float* vec_x = (float*)x->data;
float* vec_denoised = (float*)denoised->data;
for (int j = 0; j < ggml_nelements(d); j++) {
vec_d[j] = (vec_x[j] - vec_denoised[j]) / sigma;
}
}
float dt = sigmas[i + 1] - sigma;
// x = x + d * dt
{
float* vec_d = (float*)d->data;
float* vec_x = (float*)x->data;
for (int j = 0; j < ggml_nelements(x); j++) {
vec_x[j] = vec_x[j] + vec_d[j] * dt;
}
}
}
} break;
case HEUN: {
struct ggml_tensor* d = ggml_dup_tensor(work_ctx, x);
struct ggml_tensor* x2 = ggml_dup_tensor(work_ctx, x);
for (int i = 0; i < steps; i++) {
// denoise
denoise(x, sigmas[i], -(i + 1));
// d = (x - denoised) / sigma
{
float* vec_d = (float*)d->data;
float* vec_x = (float*)x->data;
float* vec_denoised = (float*)denoised->data;
for (int j = 0; j < ggml_nelements(x); j++) {
vec_d[j] = (vec_x[j] - vec_denoised[j]) / sigmas[i];
}
}
float dt = sigmas[i + 1] - sigmas[i];
if (sigmas[i + 1] == 0) {
// Euler step
// x = x + d * dt
float* vec_d = (float*)d->data;
float* vec_x = (float*)x->data;
for (int j = 0; j < ggml_nelements(x); j++) {
vec_x[j] = vec_x[j] + vec_d[j] * dt;
}
} else {
// Heun step
float* vec_d = (float*)d->data;
float* vec_x = (float*)x->data;
float* vec_x2 = (float*)x2->data;
for (int j = 0; j < ggml_nelements(x); j++) {
vec_x2[j] = vec_x[j] + vec_d[j] * dt;
}
denoise(x2, sigmas[i + 1], i + 1);
float* vec_denoised = (float*)denoised->data;
for (int j = 0; j < ggml_nelements(x); j++) {
float d2 = (vec_x2[j] - vec_denoised[j]) / sigmas[i + 1];
vec_d[j] = (vec_d[j] + d2) / 2;
vec_x[j] = vec_x[j] + vec_d[j] * dt;
}
}
}
} break;
case DPM2: {
struct ggml_tensor* d = ggml_dup_tensor(work_ctx, x);
struct ggml_tensor* x2 = ggml_dup_tensor(work_ctx, x);
for (int i = 0; i < steps; i++) {
// denoise
denoise(x, sigmas[i], i + 1);
// d = (x - denoised) / sigma
{
float* vec_d = (float*)d->data;
float* vec_x = (float*)x->data;
float* vec_denoised = (float*)denoised->data;
for (int j = 0; j < ggml_nelements(x); j++) {
vec_d[j] = (vec_x[j] - vec_denoised[j]) / sigmas[i];
}
}
if (sigmas[i + 1] == 0) {
// Euler step
// x = x + d * dt
float dt = sigmas[i + 1] - sigmas[i];
float* vec_d = (float*)d->data;
float* vec_x = (float*)x->data;
for (int j = 0; j < ggml_nelements(x); j++) {
vec_x[j] = vec_x[j] + vec_d[j] * dt;
}
} else {
// DPM-Solver-2
float sigma_mid = exp(0.5f * (log(sigmas[i]) + log(sigmas[i + 1])));
float dt_1 = sigma_mid - sigmas[i];
float dt_2 = sigmas[i + 1] - sigmas[i];
float* vec_d = (float*)d->data;
float* vec_x = (float*)x->data;
float* vec_x2 = (float*)x2->data;
for (int j = 0; j < ggml_nelements(x); j++) {
vec_x2[j] = vec_x[j] + vec_d[j] * dt_1;
}
denoise(x2, sigma_mid, i + 1);
float* vec_denoised = (float*)denoised->data;
for (int j = 0; j < ggml_nelements(x); j++) {
float d2 = (vec_x2[j] - vec_denoised[j]) / sigma_mid;
vec_x[j] = vec_x[j] + d2 * dt_2;
}
}
}
} break;
case DPMPP2S_A: {
struct ggml_tensor* noise = ggml_dup_tensor(work_ctx, x);
struct ggml_tensor* d = ggml_dup_tensor(work_ctx, x);
struct ggml_tensor* x2 = ggml_dup_tensor(work_ctx, x);
for (int i = 0; i < steps; i++) {
// denoise
denoise(x, sigmas[i], i + 1);
// get_ancestral_step
float sigma_up = std::min(sigmas[i + 1],
std::sqrt(sigmas[i + 1] * sigmas[i + 1] * (sigmas[i] * sigmas[i] - sigmas[i + 1] * sigmas[i + 1]) / (sigmas[i] * sigmas[i])));
float sigma_down = std::sqrt(sigmas[i + 1] * sigmas[i + 1] - sigma_up * sigma_up);
auto t_fn = [](float sigma) -> float { return -log(sigma); };
auto sigma_fn = [](float t) -> float { return exp(-t); };
if (sigma_down == 0) {
// Euler step
float* vec_d = (float*)d->data;
float* vec_x = (float*)x->data;
float* vec_denoised = (float*)denoised->data;
for (int j = 0; j < ggml_nelements(d); j++) {
vec_d[j] = (vec_x[j] - vec_denoised[j]) / sigmas[i];
}
// TODO: If sigma_down == 0, isn't this wrong?
// But
// https://github.com/crowsonkb/k-diffusion/blob/master/k_diffusion/sampling.py#L525
// has this exactly the same way.
float dt = sigma_down - sigmas[i];
for (int j = 0; j < ggml_nelements(d); j++) {
vec_x[j] = vec_x[j] + vec_d[j] * dt;
}
} else {
// DPM-Solver++(2S)
float t = t_fn(sigmas[i]);
float t_next = t_fn(sigma_down);
float h = t_next - t;
float s = t + 0.5f * h;
float* vec_d = (float*)d->data;
float* vec_x = (float*)x->data;
float* vec_x2 = (float*)x2->data;
float* vec_denoised = (float*)denoised->data;
// First half-step
for (int j = 0; j < ggml_nelements(x); j++) {
vec_x2[j] = (sigma_fn(s) / sigma_fn(t)) * vec_x[j] - (exp(-h * 0.5f) - 1) * vec_denoised[j];
}
denoise(x2, sigmas[i + 1], i + 1);
// Second half-step
for (int j = 0; j < ggml_nelements(x); j++) {
vec_x[j] = (sigma_fn(t_next) / sigma_fn(t)) * vec_x[j] - (exp(-h) - 1) * vec_denoised[j];
}
}
// Noise addition
if (sigmas[i + 1] > 0) {
ggml_tensor_set_f32_randn(noise, rng);
{
float* vec_x = (float*)x->data;
float* vec_noise = (float*)noise->data;
for (int j = 0; j < ggml_nelements(x); j++) {
vec_x[j] = vec_x[j] + vec_noise[j] * sigma_up;
}
}
}
}
} break;
case DPMPP2M: // DPM++ (2M) from Karras et al (2022)
{
struct ggml_tensor* old_denoised = ggml_dup_tensor(work_ctx, x);
auto t_fn = [](float sigma) -> float { return -log(sigma); };
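// DPM++(2M) update: x <- (sigma_{i+1} / sigma_i) * x - (exp(-h) - 1) * denoised_d,
// where denoised_d extrapolates from the current and previous denoised estimates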
for (int i = 0; i < steps; i++) {
// denoise
denoise(x, sigmas[i], i + 1);
float t = t_fn(sigmas[i]);
float t_next = t_fn(sigmas[i + 1]);
float h = t_next - t;
float a = sigmas[i + 1] / sigmas[i];
float b = exp(-h) - 1.f;
float* vec_x = (float*)x->data;
float* vec_denoised = (float*)denoised->data;
float* vec_old_denoised = (float*)old_denoised->data;
if (i == 0 || sigmas[i + 1] == 0) {
// Simpler step for the edge cases
for (int j = 0; j < ggml_nelements(x); j++) {
vec_x[j] = a * vec_x[j] - b * vec_denoised[j];
}
} else {
float h_last = t - t_fn(sigmas[i - 1]);
float r = h_last / h;
for (int j = 0; j < ggml_nelements(x); j++) {
float denoised_d = (1.f + 1.f / (2.f * r)) * vec_denoised[j] - (1.f / (2.f * r)) * vec_old_denoised[j];
vec_x[j] = a * vec_x[j] - b * denoised_d;
}
}
// old_denoised = denoised
for (int j = 0; j < ggml_nelements(x); j++) {
vec_old_denoised[j] = vec_denoised[j];
}
}
} break;
case DPMPP2Mv2: // Modified DPM++ (2M) from https://github.com/AUTOMATIC1111/stable-diffusion-webui/discussions/8457
{
struct ggml_tensor* old_denoised = ggml_dup_tensor(work_ctx, x);
auto t_fn = [](float sigma) -> float { return -log(sigma); };
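// same update as DPM++(2M), but the step size h is replaced by the average of
// the current and previous step sizes and r by their ratio (h_max / h_min)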
for (int i = 0; i < steps; i++) {
// denoise
denoise(x, sigmas[i], i + 1);
float t = t_fn(sigmas[i]);
float t_next = t_fn(sigmas[i + 1]);
float h = t_next - t;
float a = sigmas[i + 1] / sigmas[i];
float* vec_x = (float*)x->data;
float* vec_denoised = (float*)denoised->data;
float* vec_old_denoised = (float*)old_denoised->data;
if (i == 0 || sigmas[i + 1] == 0) {
// Simpler step for the edge cases
float b = exp(-h) - 1.f;
for (int j = 0; j < ggml_nelements(x); j++) {
vec_x[j] = a * vec_x[j] - b * vec_denoised[j];
}
} else {
float h_last = t - t_fn(sigmas[i - 1]);
float h_min = std::min(h_last, h);
float h_max = std::max(h_last, h);
float r = h_max / h_min;
float h_d = (h_max + h_min) / 2.f;
float b = exp(-h_d) - 1.f;
for (int j = 0; j < ggml_nelements(x); j++) {
float denoised_d = (1.f + 1.f / (2.f * r)) * vec_denoised[j] - (1.f / (2.f * r)) * vec_old_denoised[j];
vec_x[j] = a * vec_x[j] - b * denoised_d;
}
}
// old_denoised = denoised
for (int j = 0; j < ggml_nelements(x); j++) {
vec_old_denoised[j] = vec_denoised[j];
}
}
} break;
case LCM: // Latent Consistency Models
{
struct ggml_tensor* noise = ggml_dup_tensor(work_ctx, x);
struct ggml_tensor* d = ggml_dup_tensor(work_ctx, x);
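// LCM step: take the consistency-model prediction as the new x directly,
// then re-noise it to sigmas[i + 1] unless this is the last step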
for (int i = 0; i < steps; i++) {
float sigma = sigmas[i];
// denoise
denoise(x, sigma, i + 1);
// x = denoised
{
float* vec_x = (float*)x->data;
float* vec_denoised = (float*)denoised->data;
for (int j = 0; j < ggml_nelements(x); j++) {
vec_x[j] = vec_denoised[j];
}
}
if (sigmas[i + 1] > 0) {
// x += sigmas[i + 1] * noise_sampler(sigmas[i], sigmas[i + 1])
ggml_tensor_set_f32_randn(noise, rng);
// noise = load_tensor_from_file(res_ctx, "./rand" + std::to_string(i+1) + ".bin");
{
float* vec_x = (float*)x->data;
float* vec_noise = (float*)noise->data;
for (int j = 0; j < ggml_nelements(x); j++) {
vec_x[j] = vec_x[j] + sigmas[i + 1] * vec_noise[j];
}
}
}
}
} break;
default:
LOG_ERROR("Attempting to sample with nonexisting sample method %i", method);
abort();
}
diffusion_model.end();
return x;
}
// ldm.models.diffusion.ddpm.LatentDiffusion.get_first_stage_encoding
ggml_tensor* get_first_stage_encoding(ggml_context* work_ctx, ggml_tensor* moments) {
// ldm.modules.distributions.distributions.DiagonalGaussianDistribution.sample
ggml_tensor* latent = ggml_new_tensor_4d(work_ctx, moments->type, moments->ne[0], moments->ne[1], moments->ne[2] / 2, moments->ne[3]);
struct ggml_tensor* noise = ggml_dup_tensor(work_ctx, latent);
ggml_tensor_set_f32_randn(noise, rng);
// noise = load_tensor_from_file(work_ctx, "noise.bin");
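// moments stores mean and logvar halves along dim 2; sample
// latent = (mean + exp(0.5 * clamp(logvar, -30, 20)) * noise) * scale_factor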
{
float mean = 0;
float logvar = 0;
float value = 0;
float std_ = 0;
for (int i = 0; i < latent->ne[3]; i++) {
for (int j = 0; j < latent->ne[2]; j++) {
for (int k = 0; k < latent->ne[1]; k++) {
for (int l = 0; l < latent->ne[0]; l++) {
mean = ggml_tensor_get_f32(moments, l, k, j, i);
logvar = ggml_tensor_get_f32(moments, l, k, j + (int)latent->ne[2], i);
logvar = std::max(-30.0f, std::min(logvar, 20.0f));
std_ = std::exp(0.5f * logvar);
value = mean + std_ * ggml_tensor_get_f32(noise, l, k, j, i);
value = value * scale_factor;
// printf("%d %d %d %d -> %f\n", i, j, k, l, value);
ggml_tensor_set_f32(latent, value, l, k, j, i);
}
}
}
}
}
return latent;
}
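// run the VAE (or TAESD) on x: decode maps a latent to an 8x larger RGB image,
// encode maps an RGB image to a latent 8x smaller with 8 channels (moments) for
// the full VAE or 4 for TAESD; optionally processed in overlapping tiles to
// bound memory usage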
ggml_tensor* compute_first_stage(ggml_context* work_ctx, ggml_tensor* x, bool decode) {
int64_t W = x->ne[0];
int64_t H = x->ne[1];
ggml_tensor* result = ggml_new_tensor_3d(work_ctx, GGML_TYPE_F32,
decode ? (W * 8) : (W / 8), // width
decode ? (H * 8) : (H / 8), // height
decode ? 3 : (use_tiny_autoencoder ? 4 : 8)); // channels
int64_t t0 = ggml_time_ms();
if (!use_tiny_autoencoder) {
if (decode) {
ggml_tensor_scale(x, 1.0f / scale_factor);
} else {
ggml_tensor_scale_input(x);
}
if (vae_tiling && decode) { // TODO: support tiling vae encode
// split latent in 32x32 tiles and compute in several steps
auto on_tiling = [&](ggml_tensor* in, ggml_tensor* out, bool init) {
if (init) {
first_stage_model.begin(in, decode);
} else {
first_stage_model.compute(out, n_threads, in, decode);
}
};
sd_tiling(x, result, 8, 32, 0.5f, on_tiling);
} else {
first_stage_model.begin(x, decode);
first_stage_model.compute(result, n_threads, x, decode);
}
first_stage_model.end();
if (decode) {
ggml_tensor_scale_output(result);
}
} else {
if (vae_tiling && decode) { // TODO: support tiling vae encode
// split latent in 64x64 tiles and compute in several steps
auto on_tiling = [&](ggml_tensor* in, ggml_tensor* out, bool init) {
if (init) {
tae_first_stage.begin(in, decode);
} else {
tae_first_stage.compute(out, n_threads, in, decode);
}
};
sd_tiling(x, result, 8, 64, 0.5f, on_tiling);
} else {
tae_first_stage.begin(x, decode);
tae_first_stage.compute(result, n_threads, x, decode);
}
tae_first_stage.end();
}
int64_t t1 = ggml_time_ms();
LOG_DEBUG("computing vae [mode: %s] graph completed, taking %.2fs", decode ? "DECODE" : "ENCODE", (t1 - t0) * 1.0f / 1000);
if (decode) {
ggml_tensor_clamp(result, 0.0f, 1.0f);
}
return result;
}
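// upscale an RGB image tensor with ESRGAN, processing it in overlapping tiles
// and clamping the result to [0, 1] before converting it to 8-bit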
uint8_t* upscale(ggml_tensor* image) {
int output_width = image->ne[0] * esrgan_upscaler.scale;
int output_height = image->ne[1] * esrgan_upscaler.scale;
LOG_INFO("upscaling from (%i x %i) to (%i x %i)", image->ne[0], image->ne[1], output_width, output_height);
struct ggml_init_params params;
params.mem_size = output_width * output_height * 3 * sizeof(float); // upscaled
params.mem_size += 1 * ggml_tensor_overhead();
params.mem_buffer = NULL;
params.no_alloc = false;
// draft context
struct ggml_context* upscale_ctx = ggml_init(params);
if (!upscale_ctx) {
LOG_ERROR("ggml_init() failed");
return NULL;
}
LOG_DEBUG("upscale work buffer size: %.2f MB", params.mem_size / 1024.f / 1024.f);
ggml_tensor* upscaled = ggml_new_tensor_4d(upscale_ctx, GGML_TYPE_F32, output_width, output_height, image->ne[2], 1);
auto on_tiling = [&](ggml_tensor* in, ggml_tensor* out, bool init) {
if (init) {
esrgan_upscaler.begin(in);
} else {
esrgan_upscaler.compute(out, n_threads, in);
}
};
int64_t t0 = ggml_time_ms();
sd_tiling(image, upscaled, esrgan_upscaler.scale, esrgan_upscaler.tile_size, 0.25f, on_tiling);
esrgan_upscaler.end();
ggml_tensor_clamp(upscaled, 0.f, 1.f);
uint8_t* upscaled_data = sd_tensor_to_image(upscaled);
ggml_free(upscale_ctx);
int64_t t3 = ggml_time_ms();
LOG_INFO("image upscaled, taking %.2fs", (t3 - t0) / 1000.0f);
return upscaled_data;
}
ggml_tensor* encode_first_stage(ggml_context* work_ctx, ggml_tensor* x) {
return compute_first_stage(work_ctx, x, false);
}
ggml_tensor* decode_first_stage(ggml_context* work_ctx, ggml_tensor* x) {
return compute_first_stage(work_ctx, x, true);
}
};
/*================================================= StableDiffusion ==================================================*/
StableDiffusion::StableDiffusion(int n_threads,
bool vae_decode_only,
std::string taesd_path,
std::string esrgan_path,
bool free_params_immediately,
bool vae_tiling,
std::string lora_model_dir,
RNGType rng_type) {
sd = std::make_shared<StableDiffusionGGML>(n_threads,
vae_decode_only,
free_params_immediately,
lora_model_dir,
rng_type);
sd->use_tiny_autoencoder = taesd_path.size() > 0;
sd->taesd_path = taesd_path;
sd->upscale_output = esrgan_path.size() > 0;
sd->esrgan_path = esrgan_path;
sd->vae_tiling = vae_tiling;
}
bool StableDiffusion::load_from_file(const std::string& model_path,
const std::string& vae_path,
ggml_type wtype,
Schedule s,
int clip_skip) {
return sd->load_from_file(model_path, vae_path, wtype, s, clip_skip);
}
std::vector<uint8_t*> StableDiffusion::txt2img(std::string prompt,
std::string negative_prompt,
float cfg_scale,
int width,
int height,
SampleMethod sample_method,
int sample_steps,
int64_t seed,
int batch_count) {
std::vector<uint8_t*> results;
// if (width >= 1024 && height >= 1024) { // 1024 x 1024 images
// LOG_WARN("Image too large, try a smaller size.");
// return results;
// }
// extract and remove lora
auto result_pair = extract_and_remove_lora(prompt);
std::unordered_map<std::string, float> lora_f2m = result_pair.first; // lora_name -> multiplier
for (auto& kv : lora_f2m) {
LOG_DEBUG("lora %s:%.2f", kv.first.c_str(), kv.second);
}
prompt = result_pair.second;
LOG_DEBUG("prompt after extract and remove lora: \"%s\"", prompt.c_str());
int64_t t0 = ggml_time_ms();
sd->apply_loras(lora_f2m);
int64_t t1 = ggml_time_ms();
LOG_INFO("apply_loras completed, taking %.2fs", (t1 - t0) * 1.0f / 1000);
struct ggml_init_params params;
params.mem_size = static_cast<size_t>(10 * 1024 * 1024); // 10 MB
params.mem_size += width * height * 3 * sizeof(float);
params.mem_size *= batch_count;
params.mem_buffer = NULL;
params.no_alloc = false;
// LOG_DEBUG("mem_size %u ", params.mem_size);
struct ggml_context* work_ctx = ggml_init(params);
if (!work_ctx) {
LOG_ERROR("ggml_init() failed");
return results;
}
if (seed < 0) {
// Generally, when using the provided command line, the seed is always >0.
// However, to prevent potential issues if 'stable-diffusion.cpp' is invoked as a library
// by a third party with a seed <0, let's incorporate randomization here.
srand((int)time(NULL));
seed = rand();
}
t0 = ggml_time_ms();
auto cond_pair = sd->get_learned_condition(work_ctx, prompt, width, height);
ggml_tensor* c = cond_pair.first;
ggml_tensor* c_vector = cond_pair.second; // [adm_in_channels, ]
struct ggml_tensor* uc = NULL;
struct ggml_tensor* uc_vector = NULL;
if (cfg_scale != 1.0) {
bool force_zero_embeddings = false;
if (sd->version == VERSION_XL && negative_prompt.size() == 0) {
force_zero_embeddings = true;
}
auto uncond_pair = sd->get_learned_condition(work_ctx, negative_prompt, width, height, force_zero_embeddings);
uc = uncond_pair.first;
uc_vector = uncond_pair.second; // [adm_in_channels, ]
}
t1 = ggml_time_ms();
LOG_INFO("get_learned_condition completed, taking %" PRId64 " ms", t1 - t0);
if (sd->free_params_immediately) {
sd->cond_stage_model.destroy();
}
std::vector<struct ggml_tensor*> final_latents; // collect latents to decode
int C = 4;
int W = width / 8;
int H = height / 8;
LOG_INFO("sampling using %s method", sampling_methods_str[sample_method]);
for (int b = 0; b < batch_count; b++) {
int64_t sampling_start = ggml_time_ms();
int cur_seed = seed + b;
LOG_INFO("generating image: %i/%i - seed %i", b + 1, batch_count, cur_seed);
sd->rng->manual_seed(cur_seed);
struct ggml_tensor* x_t = ggml_new_tensor_4d(work_ctx, GGML_TYPE_F32, W, H, C, 1);
ggml_tensor_set_f32_randn(x_t, sd->rng);
std::vector<float> sigmas = sd->denoiser->schedule->get_sigmas(sample_steps);
struct ggml_tensor* x_0 = sd->sample(work_ctx, x_t, NULL, c, c_vector, uc, uc_vector, cfg_scale, sample_method, sigmas);
// struct ggml_tensor* x_0 = load_tensor_from_file(ctx, "samples_ddim.bin");
// print_ggml_tensor(x_0);
int64_t sampling_end = ggml_time_ms();
LOG_INFO("sampling completed, taking %.2fs", (sampling_end - sampling_start) * 1.0f / 1000);
final_latents.push_back(x_0);
}
if (sd->free_params_immediately) {
sd->diffusion_model.destroy();
}
int64_t t3 = ggml_time_ms();
LOG_INFO("generating %" PRId64 " latent images completed, taking %.2fs", final_latents.size(), (t3 - t1) * 1.0f / 1000);
LOG_INFO("decoding %zu latents", final_latents.size());
std::vector<struct ggml_tensor*> decoded_images; // collect decoded images
for (size_t i = 0; i < final_latents.size(); i++) {
t1 = ggml_time_ms();
struct ggml_tensor* img = sd->decode_first_stage(work_ctx, final_latents[i] /* x_0 */);
// print_ggml_tensor(img);
if (img != NULL) {
decoded_images.push_back(img);
}
int64_t t2 = ggml_time_ms();
LOG_INFO("latent %" PRId64 " decoded, taking %.2fs", i + 1, (t2 - t1) * 1.0f / 1000);
}
int64_t t4 = ggml_time_ms();
LOG_INFO("decode_first_stage completed, taking %.2fs", (t4 - t3) * 1.0f / 1000);
if (sd->free_params_immediately && !sd->use_tiny_autoencoder) {
sd->first_stage_model.destroy();
}
if (sd->upscale_output) {
LOG_INFO("upscaling %" PRId64 " images", decoded_images.size());
}
for (size_t i = 0; i < decoded_images.size(); i++) {
if (sd->upscale_output) {
results.push_back(sd->upscale(decoded_images[i]));
} else {
results.push_back(sd_tensor_to_image(decoded_images[i]));
}
}
ggml_free(work_ctx);
LOG_INFO(
"txt2img completed in %.2fs",
(t4 - t0) * 1.0f / 1000);
return results;
}
std::vector<uint8_t*> StableDiffusion::img2img(const uint8_t* init_img_data,
std::string prompt,
std::string negative_prompt,
float cfg_scale,
int width,
int height,
SampleMethod sample_method,
int sample_steps,
float strength,
int64_t seed) {
std::vector<uint8_t*> result;
LOG_INFO("img2img %dx%d", width, height);
std::vector<float> sigmas = sd->denoiser->schedule->get_sigmas(sample_steps);
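// strength selects how much of the schedule to run: keep only the last
// t_enc + 1 non-zero sigmas (plus the trailing 0) so sampling starts from
// init_latent + noise * sigma_sched[0]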
size_t t_enc = static_cast<size_t>(sample_steps * strength);
if (t_enc == (size_t)sample_steps) {
t_enc = sample_steps - 1; // strength == 1.0 would otherwise index sigmas out of range below
}
LOG_INFO("target t_enc is %zu steps", t_enc);
std::vector<float> sigma_sched;
sigma_sched.assign(sigmas.begin() + sample_steps - t_enc - 1, sigmas.end());
struct ggml_init_params params;
params.mem_size = static_cast<size_t>(10 * 1024) * 1024; // 10 MB
params.mem_size += width * height * 3 * sizeof(float) * 2;
params.mem_buffer = NULL;
params.no_alloc = false;
// LOG_DEBUG("mem_size %u ", params.mem_size);
// draft context
struct ggml_context* work_ctx = ggml_init(params);
if (!work_ctx) {
LOG_ERROR("ggml_init() failed");
return result;
}
if (seed < 0) {
seed = (int)time(NULL);
}
sd->rng->manual_seed(seed);
// extract and remove lora
auto result_pair = extract_and_remove_lora(prompt);
std::unordered_map<std::string, float> lora_f2m = result_pair.first; // lora_name -> multiplier
for (auto& kv : lora_f2m) {
LOG_DEBUG("lora %s:%.2f", kv.first.c_str(), kv.second);
}
prompt = result_pair.second;
LOG_DEBUG("prompt after extract and remove lora: \"%s\"", prompt.c_str());
// load lora from file
int64_t t0 = ggml_time_ms();
sd->apply_loras(lora_f2m);
int64_t t1 = ggml_time_ms();
LOG_INFO("apply_loras completed, taking %.2fs", (t1 - t0) * 1.0f / 1000);
ggml_tensor* init_img = ggml_new_tensor_4d(work_ctx, GGML_TYPE_F32, width, height, 3, 1);
sd_image_to_tensor(init_img_data, init_img);
t0 = ggml_time_ms();
ggml_tensor* init_latent = NULL;
if (!sd->use_tiny_autoencoder) {
ggml_tensor* moments = sd->encode_first_stage(work_ctx, init_img);
init_latent = sd->get_first_stage_encoding(work_ctx, moments);
} else {
init_latent = sd->encode_first_stage(work_ctx, init_img);
}
// print_ggml_tensor(init_latent);
t1 = ggml_time_ms();
LOG_INFO("encode_first_stage completed, taking %.2fs", (t1 - t0) * 1.0f / 1000);
auto cond_pair = sd->get_learned_condition(work_ctx, prompt, width, height);
ggml_tensor* c = cond_pair.first;
ggml_tensor* c_vector = cond_pair.second; // [adm_in_channels, ]
struct ggml_tensor* uc = NULL;
struct ggml_tensor* uc_vector = NULL;
if (cfg_scale != 1.0) {
bool force_zero_embeddings = false;
if (sd->version == VERSION_XL && negative_prompt.size() == 0) {
force_zero_embeddings = true;
}
auto uncond_pair = sd->get_learned_condition(work_ctx, negative_prompt, width, height, force_zero_embeddings);
uc = uncond_pair.first;
uc_vector = uncond_pair.second; // [adm_in_channels, ]
}
int64_t t2 = ggml_time_ms();
LOG_INFO("get_learned_condition completed, taking %" PRId64 " ms", t2 - t1);
if (sd->free_params_immediately) {
sd->cond_stage_model.destroy();
}
// SDXL
// requires encode_adm
// apply set_timestep_embedding with dim 256
sd->rng->manual_seed(seed);
struct ggml_tensor* noise = ggml_dup_tensor(work_ctx, init_latent);
ggml_tensor_set_f32_randn(noise, sd->rng);
LOG_INFO("sampling using %s method", sampling_methods_str[sample_method]);
struct ggml_tensor* x_0 = sd->sample(work_ctx, init_latent, noise, c, c_vector, uc, uc_vector, cfg_scale, sample_method, sigma_sched);
// struct ggml_tensor *x_0 = load_tensor_from_file(ctx, "samples_ddim.bin");
// print_ggml_tensor(x_0);
int64_t t3 = ggml_time_ms();
LOG_INFO("sampling completed, taking %.2fs", (t3 - t2) * 1.0f / 1000);
if (sd->free_params_immediately) {
sd->diffusion_model.destroy();
}
struct ggml_tensor* img = sd->decode_first_stage(work_ctx, x_0);
if (img != NULL) {
if (sd->upscale_output) {
result.push_back(sd->upscale(img));
} else {
result.push_back(sd_tensor_to_image(img));
}
}
int64_t t4 = ggml_time_ms();
LOG_INFO("decode_first_stage completed, taking %.2fs", (t4 - t3) * 1.0f / 1000);
if (sd->free_params_immediately && !sd->use_tiny_autoencoder) {
sd->first_stage_model.destroy();
}
LOG_INFO(
"img2img completed in %.2fs",
(t4 - t0) * 1.0f / 1000);
ggml_free(work_ctx);
return result;
}