
* first efforts at implementing PhotoMaker; lots more to do
* added PhotoMakerIDEncoder model in SD
* fixed some bugs; now PhotoMaker model weights can be loaded into their tensor buffers
* added input ID image loading
* added preprocessing of input ID images
* finished get_num_tensors
* fixed a bug in remove_duplicates
* added a get_learned_condition_with_trigger function to handle the PhotoMaker conditioning
* added a convert_token_to_id function for PhotoMaker to extract the trigger word's token id
* making progress; need to implement tokenizer decoder
* making more progress; finishing vision model forward
* debugging vision_model outputs
* corrected CLIP vision model output
* continued making progress on the ID fusion process
* finished stacked ID embedding; to be tested
* removed garbage file
* debugging graph compute
* more progress; now alloc buffer failed
* fixed wtype issue; only 1 input image supported because of an issue with the transformer when batch size > 1 (to be investigated)
* added delayed subject conditioning; now PhotoMaker runs and generates images
* fixed stat_merge_step
* added PhotoMaker LoRA model (to be tested)
* reworked pmid lora
* finished applying pmid lora; to be tested
* finalized pmid lora
* added a few tensor prints; tweaked sampling again
* small tweak; still not getting ID faces
* fixed a bug in FuseBlock forward; also removed the diag_mask op for the vision transformer; getting better results
* disabled pmid lora apply for now; 1 input image seems to work, > 1 does not
* turned pmid lora apply back on
* fixed a decode bug
* fixed a bug in ggml's conv_2d; now > 1 input images work
* added style_ratio as a CLI param; reworked encode with trigger for attention weights
* merged commit fixing the lora free-param-buffer error
* changed default style ratio to 10%
* added an option to offload the VAE decoder to CPU for memory-limited GPUs
* removing the image normalization step seems to make ID fidelity much higher
* reverted default style ratio back to 20%
* added an option for normalizing input ID images; cleaned up debugging code
* more cleanup
* fixed bugs; now failing with a CUDA error, likely out of memory on the GPU
* free pmid model params when required
* PhotoMaker working properly now after merging and adapting to the GGMLBlock API
* removed tensor renaming; fixed names in the PhotoMaker model file
* updated README.md to include instructions and notes for running PhotoMaker
* a bit of cleanup
* removed -DGGML_CUDA_FORCE_MMQ; more cleanup and README update
* added input image requirement to README
* brought back freeing the pmid lora params buffer; simply use the pooled output of CLIPVision
* removed MultiheadAttention2; customized MultiheadAttention
* added a WIN32 get_files_from_dir; turn off PhotoMaker if no input images are received
* updated docs
* fixed CI error
* made stable-diffusion.h a pure C header file. This reverts commit 27887b630db6a92f269f0aef8de9bc9832ab50a9.
* fixed CI error
* formatted code
* reuse get_learned_condition
* reuse pad_tokens
* reuse CLIPVisionModel
* reuse LoraModel
* added --clip-on-cpu
* fixed lora name conversion for SDXL

---------

Co-authored-by: bssrdf <bssrdf@gmail.com>
Co-authored-by: leejet <leejet714@gmail.com>

#ifndef __LORA_HPP__
#define __LORA_HPP__

#include "ggml_extend.hpp"

#define LORA_GRAPH_SIZE 10240

struct LoraModel : public GGMLModule {
    float multiplier = 1.0f;
    std::map<std::string, struct ggml_tensor*> lora_tensors;
    std::string file_path;
    ModelLoader model_loader;
    bool load_failed = false;

    LoraModel(ggml_backend_t backend,
              ggml_type wtype,
              const std::string& file_path = "",
              const std::string& prefix    = "")
        : GGMLModule(backend, wtype), file_path(file_path) {
        if (!model_loader.init_from_file(file_path, prefix)) {
            load_failed = true;
        }
    }

    std::string get_desc() {
        return "lora";
    }

    size_t get_params_num() {
        return LORA_GRAPH_SIZE;
    }

    size_t get_params_mem_size() {
        return model_loader.get_params_mem_size(NULL);
    }

    bool load_from_file(bool filter_tensor = false) {
        LOG_INFO("loading LoRA from '%s'", file_path.c_str());

        if (load_failed) {
            LOG_ERROR("init lora model loader from file failed: '%s'", file_path.c_str());
            return false;
        }

        bool dry_run = true;
        auto on_new_tensor_cb = [&](const TensorStorage& tensor_storage, ggml_tensor** dst_tensor) -> bool {
            const std::string& name = tensor_storage.name;

            if (filter_tensor && !contains(name, "lora")) {
                // LOG_INFO("skipping LoRA tensor '%s'", name.c_str());
                return true;
            }

            if (dry_run) {
                // first pass: only create the tensors so the params buffer can be sized and allocated
                struct ggml_tensor* real = ggml_new_tensor(params_ctx,
                                                           tensor_storage.type,
                                                           tensor_storage.n_dims,
                                                           tensor_storage.ne);
                lora_tensors[name] = real;
            } else {
                // second pass: hand back the pre-allocated tensor so its data gets filled in
                auto real   = lora_tensors[name];
                *dst_tensor = real;
            }

            return true;
        };

        model_loader.load_tensors(on_new_tensor_cb, backend);
        alloc_params_buffer();

        dry_run = false;
        model_loader.load_tensors(on_new_tensor_cb, backend);

        LOG_DEBUG("finished loading lora");
        return true;
    }

    struct ggml_cgraph* build_graph(std::map<std::string, struct ggml_tensor*> model_tensors) {
        struct ggml_cgraph* gf = ggml_new_graph_custom(compute_ctx, LORA_GRAPH_SIZE, false);

        std::set<std::string> applied_lora_tensors;
        for (auto it : model_tensors) {
            std::string k_tensor       = it.first;
            struct ggml_tensor* weight = model_tensors[it.first];

            size_t k_pos = k_tensor.find(".weight");
            if (k_pos == std::string::npos) {
                continue;
            }
            k_tensor = k_tensor.substr(0, k_pos);
            replace_all_chars(k_tensor, '.', '_');
            // LOG_DEBUG("k_tensor %s", k_tensor.c_str());
            if (k_tensor == "model_diffusion_model_output_blocks_2_2_conv") {  // fix for SDXL
                k_tensor = "model_diffusion_model_output_blocks_2_1_conv";
            }
            std::string lora_up_name   = "lora." + k_tensor + ".lora_up.weight";
            std::string lora_down_name = "lora." + k_tensor + ".lora_down.weight";
            std::string alpha_name     = "lora." + k_tensor + ".alpha";
            std::string scale_name     = "lora." + k_tensor + ".scale";

            ggml_tensor* lora_up   = NULL;
            ggml_tensor* lora_down = NULL;

            if (lora_tensors.find(lora_up_name) != lora_tensors.end()) {
                lora_up = lora_tensors[lora_up_name];
            }

            if (lora_tensors.find(lora_down_name) != lora_tensors.end()) {
                lora_down = lora_tensors[lora_down_name];
            }

            if (lora_up == NULL || lora_down == NULL) {
                continue;
            }

            applied_lora_tensors.insert(lora_up_name);
            applied_lora_tensors.insert(lora_down_name);
            applied_lora_tensors.insert(alpha_name);
            applied_lora_tensors.insert(scale_name);

            // calculate scale: use .scale if present, otherwise alpha / rank, then apply the user multiplier
            int64_t dim       = lora_down->ne[ggml_n_dims(lora_down) - 1];
            float scale_value = 1.0f;
            if (lora_tensors.find(scale_name) != lora_tensors.end()) {
                scale_value = ggml_backend_tensor_get_f32(lora_tensors[scale_name]);
            } else if (lora_tensors.find(alpha_name) != lora_tensors.end()) {
                float alpha = ggml_backend_tensor_get_f32(lora_tensors[alpha_name]);
                scale_value = alpha / dim;
            }
            scale_value *= multiplier;

            // flatten the lora tensors to 2D so they can be matrix-multiplied
            int64_t lora_up_rows   = lora_up->ne[ggml_n_dims(lora_up) - 1];
            lora_up                = ggml_reshape_2d(compute_ctx, lora_up, ggml_nelements(lora_up) / lora_up_rows, lora_up_rows);
            int64_t lora_down_rows = lora_down->ne[ggml_n_dims(lora_down) - 1];
            lora_down              = ggml_reshape_2d(compute_ctx, lora_down, ggml_nelements(lora_down) / lora_down_rows, lora_down_rows);

            // ggml_mul_mat requires tensor b transposed
            lora_down                  = ggml_cont(compute_ctx, ggml_transpose(compute_ctx, lora_down));
            struct ggml_tensor* updown = ggml_mul_mat(compute_ctx, lora_up, lora_down);
            updown                     = ggml_cont(compute_ctx, ggml_transpose(compute_ctx, updown));
            updown                     = ggml_reshape(compute_ctx, updown, weight);
            GGML_ASSERT(ggml_nelements(updown) == ggml_nelements(weight));
            updown = ggml_scale_inplace(compute_ctx, updown, scale_value);
            ggml_tensor* final_weight;
            // if (weight->type != GGML_TYPE_F32 && weight->type != GGML_TYPE_F16) {
            //     final_weight = ggml_new_tensor(compute_ctx, GGML_TYPE_F32, weight->n_dims, weight->ne);
            //     final_weight = ggml_cpy_inplace(compute_ctx, weight, final_weight);
            //     final_weight = ggml_add_inplace(compute_ctx, final_weight, updown);
            //     final_weight = ggml_cpy_inplace(compute_ctx, final_weight, weight);
            // } else {
            //     final_weight = ggml_add_inplace(compute_ctx, weight, updown);
            // }
            final_weight = ggml_add_inplace(compute_ctx, weight, updown);  // apply directly
            ggml_build_forward_expand(gf, final_weight);
        }

        for (auto& kv : lora_tensors) {
            if (applied_lora_tensors.find(kv.first) == applied_lora_tensors.end()) {
                LOG_WARN("unused lora tensor %s", kv.first.c_str());
            }
        }

        return gf;
    }

    void apply(std::map<std::string, struct ggml_tensor*> model_tensors, int n_threads) {
        auto get_graph = [&]() -> struct ggml_cgraph* {
            return build_graph(model_tensors);
        };
        GGMLModule::compute(get_graph, n_threads, true);
    }
};

#endif // __LORA_HPP__
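A minimal caller-side sketch of how LoraModel might be wired up; this is an assumption based on this header and the commit notes above, not a definitive API. The names backend, model_wtype, tensors, and n_threads are placeholders provided by the host application, and free_params_buffer() is assumed to come from the GGMLModule base class (the commit notes mention freeing the lora params buffer).

// Hypothetical usage sketch; not part of lora.hpp itself.
#include "lora.hpp"

void apply_lora_sketch(ggml_backend_t backend,
                       ggml_type model_wtype,
                       std::map<std::string, struct ggml_tensor*>& tensors,
                       int n_threads) {
    LoraModel lora(backend, model_wtype, "/path/to/lora.safetensors");
    if (!lora.load_from_file(true)) {  // filter_tensor = true keeps only tensors whose name contains "lora"
        return;
    }
    lora.multiplier = 0.8f;          // blend strength; 1.0f applies the LoRA delta at full weight
    lora.apply(tensors, n_threads);  // adds scale * (lora_up x lora_down) into each matching weight in place
    lora.free_params_buffer();       // assumed GGMLModule helper that releases the LoRA parameter buffer
}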