stable-diffusion.cpp/stable-diffusion.h
bssrdf a469688e30
feat: add TencentARC PhotoMaker support (#179)
* first efforts at implementing photomaker; lots more to do

* added PhotoMakerIDEncoder model in SD

* fixed some bugs; now photomaker model weights can be loaded into their tensor buffers

* added input id image loading

* added preprocessing of input id images

* finished get_num_tensors

* fixed a bug in remove_duplicates

* add a get_learned_condition_with_trigger function to do photomaker stuff

* add a convert_token_to_id function for photomaker to extract trigger word's token id

* making progress; need to implement tokenizer decoder

* making more progress; finishing vision model forward

* debugging vision_model outputs

* corrected clip vision model output

* continue making progress in id fusion process

* finished stacked id embedding; to be tested

* remove garbage file

* debugging graph compute

* more progress; now alloc buffer failed

* fixed wtype issue; input images can only be 1 because issue with transformer when batch size > 1 (to be investigated)

* added delayed subject conditioning; now photomaker runs and generates images

* fixed stat_merge_step

* added photomaker lora model (to be tested)

* reworked pmid lora

* finished applying pmid lora; to be tested

* finalized pmid lora

* add a few print tensor; tweak in sample again

* small tweak; still not getting ID faces

* fixed a bug in FuseBlock forward; also removed the diag_mask op for the vision transformer; getting better results

* disable pmid lora apply for now; 1 input image seems working; > 1 not working

* turn pmid lora apply back on

* fixed a decode bug

* fixed a bug in ggml's conv_2d, and now > 1 input images working

* add style_ratio as a cli param; reworked encode with trigger for attention weights

* merge commit fixing lora free param buffer error

* change default style ratio to 10%

* added an option to offload vae decoder to CPU for mem-limited gpus

* removing the image normalization step seems to make ID fidelity much higher

* revert default style ratio back to 20%

* added an option for normalizing input ID images; cleaned up debugging code

* more clean up

* fixed bugs; now failed with cuda error; likely out-of-mem on GPU

* free pmid model params when required

* photomaker working properly now after merging and adapting to GGMLBlock API

* remove tensor renaming;  fixing names in the photomaker model file

* updated README.md to include instructions and notes for running PhotoMaker

* a bit clean up

* remove -DGGML_CUDA_FORCE_MMQ; more clean up and README update

* add input image requirement in README

* bring back freeing pmid lora params buffer; simply pooled output of CLIPvision

* remove MultiheadAttention2; customized MultiheadAttention

* added a WIN32 get_files_from_dir; turn off PhotoMaker if receiving no input images

* update docs

* fix ci error

* make stable-diffusion.h a pure c header file

This reverts commit 27887b630db6a92f269f0aef8de9bc9832ab50a9.

* fix ci error

* format code

* reuse get_learned_condition

* reuse pad_tokens

* reuse CLIPVisionModel

* reuse LoraModel

* add --clip-on-cpu

* fix lora name conversion for SDXL

---------

Co-authored-by: bssrdf <bssrdf@gmail.com>
Co-authored-by: leejet <leejet714@gmail.com>
2024-03-12 23:15:17 +08:00

204 lines
6.0 KiB
C

#ifndef __STABLE_DIFFUSION_H__
#define __STABLE_DIFFUSION_H__
// SD_API decorates every public declaration in this header.
//   - Windows / Cygwin, SD_BUILD_SHARED_LIB not defined: expands to nothing.
//   - Windows / Cygwin, SD_BUILD_SHARED_LIB defined: dllexport when
//     SD_BUILD_DLL is defined, dllimport otherwise.
//   - Other platforms: default symbol visibility when __GNUC__ >= 4,
//     otherwise nothing.
#if defined(_WIN32) || defined(__CYGWIN__)
#  if !defined(SD_BUILD_SHARED_LIB)
#    define SD_API
#  elif defined(SD_BUILD_DLL)
#    define SD_API __declspec(dllexport)
#  else
#    define SD_API __declspec(dllimport)
#  endif
#elif __GNUC__ >= 4
#  define SD_API __attribute__((visibility("default")))
#else
#  define SD_API
#endif
// Standard headers are pulled in BEFORE the extern "C" block is opened:
// C++ requires library headers to be included outside of any declaration,
// and a linkage-specification counts as one.
#include <stdbool.h>
#include <stddef.h>
#include <stdint.h>
#include <string.h>

// Give every declaration below C linkage when compiled as C++.
#ifdef __cplusplus
extern "C" {
#endif
// Random number generator selector (see the rng_type argument of new_sd_ctx()).
enum rng_type_t {
    STD_DEFAULT_RNG = 0,
    CUDA_RNG        = 1
};
// Sampling method selector used by txt2img(), img2img() and img2vid().
// N_SAMPLE_METHODS is a count sentinel: it carries no explicit value so that
// it always equals the number of methods listed before it. Keep it last.
enum sample_method_t {
    EULER_A   = 0,
    EULER     = 1,
    HEUN      = 2,
    DPM2      = 3,
    DPMPP2S_A = 4,
    DPMPP2M   = 5,
    DPMPP2Mv2 = 6,
    LCM       = 7,
    N_SAMPLE_METHODS
};
// Schedule selector (see the `s` argument of new_sd_ctx()).
// N_SCHEDULES is a count sentinel and must remain the last enumerator.
enum schedule_t {
    DEFAULT  = 0,
    DISCRETE = 1,
    KARRAS   = 2,
    N_SCHEDULES
};
// Data type identifiers. The numeric values are the same as enum ggml_type,
// so every value is written out explicitly; 4 and 5 are deliberately unused.
// SD_TYPE_COUNT is a sentinel that follows the last real type.
enum sd_type_t {
    SD_TYPE_F32  = 0,
    SD_TYPE_F16  = 1,
    SD_TYPE_Q4_0 = 2,
    SD_TYPE_Q4_1 = 3,
    // 4 (SD_TYPE_Q4_2): support has been removed
    // 5 (SD_TYPE_Q4_3): support has been removed
    SD_TYPE_Q5_0 = 6,
    SD_TYPE_Q5_1 = 7,
    SD_TYPE_Q8_0 = 8,
    SD_TYPE_Q8_1 = 9,

    // k-quantizations
    SD_TYPE_Q2_K = 10,
    SD_TYPE_Q3_K = 11,
    SD_TYPE_Q4_K = 12,
    SD_TYPE_Q5_K = 13,
    SD_TYPE_Q6_K = 14,
    SD_TYPE_Q8_K = 15,

    SD_TYPE_IQ2_XXS = 16,
    SD_TYPE_IQ2_XS  = 17,
    SD_TYPE_IQ3_XXS = 18,
    SD_TYPE_IQ1_S   = 19,
    SD_TYPE_IQ4_NL  = 20,
    SD_TYPE_IQ3_S   = 21,
    SD_TYPE_IQ2_S   = 22,
    SD_TYPE_IQ4_XS  = 23,

    // plain integer types
    SD_TYPE_I8  = 24,
    SD_TYPE_I16 = 25,
    SD_TYPE_I32 = 26,

    SD_TYPE_COUNT
};
// Returns a C string for the given sd_type_t value.
// NOTE(review): presumably the human-readable type name; the storage and
// lifetime of the returned string are not visible from this header — confirm
// against the implementation before freeing or caching it.
SD_API const char* sd_type_name(enum sd_type_t type);
// Severity passed as the first argument of an sd_log_cb_t callback.
enum sd_log_level_t {
    SD_LOG_DEBUG = 0,
    SD_LOG_INFO  = 1,
    SD_LOG_WARN  = 2,
    SD_LOG_ERROR = 3
};
// Log callback signature: receives a severity level, the message text and an
// opaque `data` pointer.
typedef void (*sd_log_cb_t)(enum sd_log_level_t level, const char* text, void* data);
// Progress callback signature: receives two ints (`step`, `steps`), a float
// (`time`) and an opaque `data` pointer.
// NOTE(review): presumably `step`/`steps` are the current and total step
// counts and `time` is a duration; units are not visible here — confirm.
typedef void (*sd_progress_cb_t)(int step, int steps, float time, void* data);
// Registers a log callback together with a user pointer.
// NOTE(review): presumably `data` is handed back unchanged as the callback's
// last argument — confirm against the implementation.
SD_API void sd_set_log_callback(sd_log_cb_t sd_log_cb, void* data);
// Registers a progress callback together with a user pointer (the same
// `data` assumption as for the log callback applies).
SD_API void sd_set_progress_callback(sd_progress_cb_t cb, void* data);
// Reports a physical CPU core count as a signed 32-bit integer.
// Declared with (void): in C an empty parameter list is not a prototype and
// disables argument checking, whereas (void) means "takes no arguments" in
// both C and C++.
SD_API int32_t get_num_physical_cores(void);
// Returns a C string with system information.
// NOTE(review): the contents and the lifetime of the returned string are not
// visible from this header — confirm against the implementation.
SD_API const char* sd_get_system_info(void);
// Image descriptor passed to and returned from the image APIs declared in
// this header.
// NOTE(review): presumably `data` points to width * height * channel bytes of
// interleaved pixel data; the layout and who owns/frees the buffer are not
// visible here — confirm against the implementation.
typedef struct {
// Image width (presumably in pixels).
uint32_t width;
// Image height (presumably in pixels).
uint32_t height;
// Channel count (presumably per pixel, e.g. 3 for RGB — confirm).
uint32_t channel;
// Pointer to the 8-bit sample data.
uint8_t* data;
} sd_image_t;
// Opaque context handle; struct sd_ctx_t is only forward-declared here and
// is defined outside this header.
typedef struct sd_ctx_t sd_ctx_t;
// Creates a context and returns a pointer to it.
//
// Path-like arguments (all `const char*`): model_path, vae_path, taesd_path,
// control_net_path_c_str, lora_model_dir, embed_dir_c_str,
// stacked_id_embed_dir_c_str.
// NOTE(review): presumably stacked_id_embed_dir_c_str locates the PhotoMaker
// stacked-ID-embedding model, and optional paths may be passed as empty
// strings — neither is visible here, confirm against the implementation.
//
// Flags and options: vae_decode_only, vae_tiling, free_params_immediately,
// n_threads, wtype (weight type), rng_type, s (schedule), and the three
// keep_*_on_cpu / keep_control_net_cpu placement switches.
// NOTE(review): presumably the keep_* switches keep the named sub-model on
// the CPU backend to save GPU memory — confirm.
//
// NOTE(review): the failure return value (presumably NULL) is not visible
// from this header — confirm before relying on it.
SD_API sd_ctx_t* new_sd_ctx(const char* model_path,
const char* vae_path,
const char* taesd_path,
const char* control_net_path_c_str,
const char* lora_model_dir,
const char* embed_dir_c_str,
const char* stacked_id_embed_dir_c_str,
bool vae_decode_only,
bool vae_tiling,
bool free_params_immediately,
int n_threads,
enum sd_type_t wtype,
enum rng_type_t rng_type,
enum schedule_t s,
bool keep_clip_on_cpu,
bool keep_control_net_cpu,
bool keep_vae_on_cpu);
// Counterpart of new_sd_ctx(): takes the context pointer to dispose of.
// NOTE(review): presumably releases everything owned by the context; whether
// NULL is accepted is not visible here — confirm.
SD_API void free_sd_ctx(sd_ctx_t* sd_ctx);
// Text-to-image entry point. Takes a context, a prompt / negative prompt
// pair, sampling settings (clip_skip, cfg_scale, width, height,
// sample_method, sample_steps, seed, batch_count), an optional control image
// with its strength, and three further inputs (style_strength,
// normalize_input, input_id_images_path).
// NOTE(review): presumably style_strength, normalize_input and
// input_id_images_path are the PhotoMaker inputs (style ratio, ID-image
// normalization, directory of ID images) — confirm against the
// implementation.
// NOTE(review): presumably returns an array of batch_count images that the
// caller must free, and NULL on failure — ownership and error reporting are
// not visible from this header, confirm.
SD_API sd_image_t* txt2img(sd_ctx_t* sd_ctx,
const char* prompt,
const char* negative_prompt,
int clip_skip,
float cfg_scale,
int width,
int height,
enum sample_method_t sample_method,
int sample_steps,
int64_t seed,
int batch_count,
const sd_image_t* control_cond,
float control_strength,
float style_strength,
bool normalize_input,
const char* input_id_images_path);
// Image-to-image entry point. Takes a context, an initial image (passed by
// value), a prompt / negative prompt pair and sampling settings (clip_skip,
// cfg_scale, width, height, sample_method, sample_steps, strength, seed,
// batch_count).
// NOTE(review): presumably `strength` controls how far the result may depart
// from init_image — its valid range is not visible here, confirm.
// NOTE(review): presumably returns an array of batch_count images owned by
// the caller, and NULL on failure — confirm against the implementation.
SD_API sd_image_t* img2img(sd_ctx_t* sd_ctx,
sd_image_t init_image,
const char* prompt,
const char* negative_prompt,
int clip_skip,
float cfg_scale,
int width,
int height,
enum sample_method_t sample_method,
int sample_steps,
float strength,
int64_t seed,
int batch_count);
// Image-to-video entry point. Takes a context, an initial image (passed by
// value), the output size, video settings (video_frames, motion_bucket_id,
// fps, augmentation_level), guidance settings (min_cfg, cfg_scale) and
// sampling settings (sample_method, sample_steps, strength, seed).
// NOTE(review): presumably returns an array of video_frames images owned by
// the caller, and NULL on failure — the element count, ownership and error
// reporting are not visible from this header, confirm.
SD_API sd_image_t* img2vid(sd_ctx_t* sd_ctx,
sd_image_t init_image,
int width,
int height,
int video_frames,
int motion_bucket_id,
int fps,
float augmentation_level,
float min_cfg,
float cfg_scale,
enum sample_method_t sample_method,
int sample_steps,
float strength,
int64_t seed);
// Opaque upscaler handle; struct upscaler_ctx_t is only forward-declared
// here and is defined outside this header.
typedef struct upscaler_ctx_t upscaler_ctx_t;
// Creates an upscaler context from a model path, a thread count and a weight
// type, and returns a pointer to it.
// NOTE(review): presumably esrgan_path names an ESRGAN model file and NULL is
// returned on failure — confirm against the implementation.
SD_API upscaler_ctx_t* new_upscaler_ctx(const char* esrgan_path,
int n_threads,
enum sd_type_t wtype);
// Counterpart of new_upscaler_ctx(): takes the upscaler context to dispose of.
// NOTE(review): whether NULL is accepted is not visible here — confirm.
SD_API void free_upscaler_ctx(upscaler_ctx_t* upscaler_ctx);
// Upscaling entry point: takes an upscaler context, an input image (by value)
// and an unsigned upscale factor, and returns an image by value.
// NOTE(review): presumably the returned image's `data` is newly allocated and
// owned by the caller, and is NULL on failure — confirm against the
// implementation.
SD_API sd_image_t upscale(upscaler_ctx_t* upscaler_ctx, sd_image_t input_image, uint32_t upscale_factor);
// Model conversion entry point: takes an input path, a VAE path, an output
// path and the desired output data type.
// NOTE(review): presumably writes the model from input_path to output_path
// using output_type and returns true on success — confirm against the
// implementation.
//
// output_type is spelled `enum sd_type_t` because sd_type_t is only an enum
// tag (this header declares no typedef for it); C requires the `enum` keyword
// when naming a tag, and the spelling is equally valid in C++.
SD_API bool convert(const char* input_path, const char* vae_path, const char* output_path, enum sd_type_t output_type);
// Canny preprocessing entry point: takes a byte buffer with its width and
// height, two thresholds (high, low), two float levels (weak, strong) and an
// `inverse` flag, and returns a byte pointer.
// NOTE(review): presumably `img` is an interleaved 8-bit image and the result
// is an edge map; whether the returned pointer is `img` itself or a new
// allocation (and who frees it) is not visible from this header — confirm
// against the implementation.
SD_API uint8_t* preprocess_canny(uint8_t* img,
int width,
int height,
float high_threshold,
float low_threshold,
float weak,
float strong,
bool inverse);
#ifdef __cplusplus
}
#endif
#endif // __STABLE_DIFFUSION_H__