From 4eef4c8fa827cab9c36465039e94ba20c5cb5bf1 Mon Sep 17 00:00:00 2001
From: "Harris M. Snyder"
Date: Sat, 7 Jun 2025 21:28:58 -0400
Subject: [PATCH 1/5] refactor to expose c++ internals

---
 .gitignore           |   3 +-
 CMakeLists.txt       |   8 +-
 stable-diffusion.cpp | 183 +++++++++++++------------------------------
 stable-diffusion.hpp | 170 ++++++++++++++++++++++++++++++++++++++++
 4 files changed, 234 insertions(+), 130 deletions(-)
 create mode 100644 stable-diffusion.hpp

diff --git a/.gitignore b/.gitignore
index 38fe570df..4274effab 100644
--- a/.gitignore
+++ b/.gitignore
@@ -10,4 +10,5 @@ test/
 *.gguf
 output*.png
 models*
-*.log
\ No newline at end of file
+*.log
+.idea/
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 782a893e4..6d7d30c35 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -117,7 +117,13 @@ add_definitions(-DGGML_MAX_NAME=128)
 # deps
 # Only add ggml if it hasn't been added yet
 if (NOT TARGET ggml)
-    add_subdirectory(ggml)
+    include(FetchContent)
+    FetchContent_Declare(
+        ggml
+        GIT_REPOSITORY https://github.com/ggerganov/ggml.git
+        GIT_TAG ff9052988b76e137bcf92bb335733933ca196ac0
+    )
+    FetchContent_MakeAvailable(ggml)
 endif()
 add_subdirectory(thirdparty)
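
Since ggml is now fetched at configure time (pinned to the same commit the old submodule pointed at), a checkout no longer needs `git submodule update --init`. For illustration, a downstream project could consume this library the same way. This is a hypothetical consumer CMakeLists.txt, assuming the library target is named `stable-diffusion` as in this repository's CMakeLists; the project name and GIT_TAG are placeholders:

    cmake_minimum_required(VERSION 3.14)  # FetchContent_MakeAvailable requires >= 3.14
    project(sd_consumer CXX)

    include(FetchContent)
    FetchContent_Declare(
        stable-diffusion-cpp
        GIT_REPOSITORY https://github.com/leejet/stable-diffusion.cpp.git
        GIT_TAG        master  # placeholder -- pin a commit hash in practice
    )
    FetchContent_MakeAvailable(stable-diffusion-cpp)

    add_executable(demo main.cpp)
    target_link_libraries(demo PRIVATE stable-diffusion)
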
diff --git a/stable-diffusion.cpp b/stable-diffusion.cpp
index e38a6101f..d8a9fd10c 100644
--- a/stable-diffusion.cpp
+++ b/stable-diffusion.cpp
@@ -1,25 +1,12 @@
-#include "ggml_extend.hpp"
-
-#include "model.h"
-#include "rng.hpp"
-#include "rng_philox.hpp"
-#include "stable-diffusion.h"
-#include "util.h"
-
-#include "conditioner.hpp"
-#include "control.hpp"
-#include "denoiser.hpp"
-#include "diffusion_model.hpp"
-#include "esrgan.hpp"
-#include "lora.hpp"
-#include "pmid.hpp"
-#include "tae.hpp"
-#include "vae.hpp"
+#include "stable-diffusion.hpp"
+
 #define STB_IMAGE_IMPLEMENTATION
 #define STB_IMAGE_STATIC
 #include "stb_image.h"
+
+
 // #define STB_IMAGE_WRITE_IMPLEMENTATION
 // #define STB_IMAGE_WRITE_STATIC
 // #include "stb_image_write.h"
@@ -51,99 +38,36 @@ const char* sampling_methods_str[] = {
     "TCD"
 };
 
-/*================================================== Helper Functions ================================================*/
-
-void calculate_alphas_cumprod(float* alphas_cumprod,
-                              float linear_start = 0.00085f,
-                              float linear_end   = 0.0120,
-                              int timesteps      = TIMESTEPS) {
-    float ls_sqrt = sqrtf(linear_start);
-    float le_sqrt = sqrtf(linear_end);
-    float amount  = le_sqrt - ls_sqrt;
-    float product = 1.0f;
-    for (int i = 0; i < timesteps; i++) {
-        float beta = ls_sqrt + amount * ((float)i / (timesteps - 1));
-        product *= 1.0f - powf(beta, 2.0f);
-        alphas_cumprod[i] = product;
+StableDiffusionGGML::StableDiffusionGGML(int n_threads,
+                                         bool vae_decode_only,
+                                         bool free_params_immediately,
+                                         std::string lora_model_dir,
+                                         rng_type_t rng_type)
+    : n_threads(n_threads),
+      vae_decode_only(vae_decode_only),
+      free_params_immediately(free_params_immediately),
+      lora_model_dir(lora_model_dir) {
+    if (rng_type == STD_DEFAULT_RNG) {
+        rng = std::make_shared<STDDefaultRNG>();
+    } else if (rng_type == CUDA_RNG) {
+        rng = std::make_shared<PhiloxRNG>();
     }
 }
 
-/*=============================================== StableDiffusionGGML ================================================*/
-
-class StableDiffusionGGML {
-public:
-    ggml_backend_t backend             = NULL;  // general backend
-    ggml_backend_t clip_backend        = NULL;
-    ggml_backend_t control_net_backend = NULL;
-    ggml_backend_t vae_backend         = NULL;
-    ggml_type model_wtype           = GGML_TYPE_COUNT;
-    ggml_type conditioner_wtype     = GGML_TYPE_COUNT;
-    ggml_type diffusion_model_wtype = GGML_TYPE_COUNT;
-    ggml_type vae_wtype             = GGML_TYPE_COUNT;
-
-    SDVersion version;
-    bool vae_decode_only         = false;
-    bool free_params_immediately = false;
-
-    std::shared_ptr<RNG> rng = std::make_shared<STDDefaultRNG>();
-    int n_threads            = -1;
-    float scale_factor       = 0.18215f;
-
-    std::shared_ptr<Conditioner> cond_stage_model;
-    std::shared_ptr<FrozenCLIPVisionEmbedder> clip_vision;  // for svd
-    std::shared_ptr<DiffusionModel> diffusion_model;
-    std::shared_ptr<AutoEncoderKL> first_stage_model;
-    std::shared_ptr<TinyAutoEncoder> tae_first_stage;
-    std::shared_ptr<ControlNet> control_net;
-    std::shared_ptr<PhotoMakerIDEncoder> pmid_model;
-    std::shared_ptr<LoraModel> pmid_lora;
-    std::shared_ptr<PhotoMakerIDEmbed> pmid_id_embeds;
-
-    std::string taesd_path;
-    bool use_tiny_autoencoder = false;
-    bool vae_tiling           = false;
-    bool stacked_id           = false;
-
-    std::map<std::string, struct ggml_tensor*> tensors;
-
-    std::string lora_model_dir;
-    // lora_name => multiplier
-    std::unordered_map<std::string, float> curr_lora_state;
-
-    std::shared_ptr<Denoiser> denoiser = std::make_shared<CompVisDenoiser>();
-
-    StableDiffusionGGML() = default;
-
-    StableDiffusionGGML(int n_threads,
-                        bool vae_decode_only,
-                        bool free_params_immediately,
-                        std::string lora_model_dir,
-                        rng_type_t rng_type)
-        : n_threads(n_threads),
-          vae_decode_only(vae_decode_only),
-          free_params_immediately(free_params_immediately),
-          lora_model_dir(lora_model_dir) {
-        if (rng_type == STD_DEFAULT_RNG) {
-            rng = std::make_shared<STDDefaultRNG>();
-        } else if (rng_type == CUDA_RNG) {
-            rng = std::make_shared<PhiloxRNG>();
-        }
+StableDiffusionGGML::~StableDiffusionGGML() {
+    if (clip_backend != backend) {
+        ggml_backend_free(clip_backend);
     }
-
-    ~StableDiffusionGGML() {
-        if (clip_backend != backend) {
-            ggml_backend_free(clip_backend);
-        }
-        if (control_net_backend != backend) {
-            ggml_backend_free(control_net_backend);
-        }
-        if (vae_backend != backend) {
-            ggml_backend_free(vae_backend);
-        }
-        ggml_backend_free(backend);
+    if (control_net_backend != backend) {
+        ggml_backend_free(control_net_backend);
     }
+    if (vae_backend != backend) {
+        ggml_backend_free(vae_backend);
+    }
+    ggml_backend_free(backend);
+}
 
-    bool load_from_file(const std::string& model_path,
+bool StableDiffusionGGML::load_from_file(const std::string& model_path,
                         const std::string& clip_l_path,
                         const std::string& clip_g_path,
                         const std::string& t5xxl_path,
@@ -602,7 +526,7 @@ class StableDiffusionGGML {
         return true;
     }
 
-    bool is_using_v_parameterization_for_sd2(ggml_context* work_ctx, bool is_inpaint = false) {
+    bool StableDiffusionGGML::is_using_v_parameterization_for_sd2(ggml_context* work_ctx, bool is_inpaint) {
         struct ggml_tensor* x_t = ggml_new_tensor_4d(work_ctx, GGML_TYPE_F32, 8, 8, 4, 1);
         ggml_set_f32(x_t, 0.5);
         struct ggml_tensor* c = ggml_new_tensor_4d(work_ctx, GGML_TYPE_F32, 1024, 2, 1, 1);
@@ -638,7 +562,7 @@ class StableDiffusionGGML {
         return result < -1;
     }
 
-    void apply_lora(const std::string& lora_name, float multiplier) {
+    void StableDiffusionGGML::apply_lora(const std::string& lora_name, float multiplier) {
         int64_t t0                 = ggml_time_ms();
         std::string st_file_path   = path_join(lora_model_dir, lora_name + ".safetensors");
         std::string ckpt_file_path = path_join(lora_model_dir, lora_name + ".ckpt");
@@ -667,7 +591,7 @@ class StableDiffusionGGML {
         LOG_INFO("lora '%s' applied, taking %.2fs", lora_name.c_str(), (t1 - t0) * 1.0f / 1000);
     }
 
-    void apply_loras(const std::unordered_map<std::string, float>& lora_state) {
+    void StableDiffusionGGML::apply_loras(const std::unordered_map<std::string, float>& lora_state) {
         if (lora_state.size() > 0 && model_wtype != GGML_TYPE_F16 && model_wtype != GGML_TYPE_F32) {
             LOG_WARN("In quantized models when applying LoRA, the images have poor quality.");
         }
@@ -682,7 +606,7 @@ class StableDiffusionGGML {
             float curr_multiplier = kv.second;
             lora_state_diff[lora_name] -= curr_multiplier;
         }
-
+
         size_t rm = lora_state_diff.size() - lora_state.size();
         if (rm != 0) {
             LOG_INFO("Attempting to apply %lu LoRAs (removing %lu applied LoRAs)", lora_state.size(), rm);
@@ -697,7 +621,7 @@ class StableDiffusionGGML {
         curr_lora_state = lora_state;
     }
 
-    ggml_tensor* id_encoder(ggml_context* work_ctx,
+    ggml_tensor* StableDiffusionGGML::id_encoder(ggml_context* work_ctx,
                             ggml_tensor* init_img,
                             ggml_tensor* prompts_embeds,
                             ggml_tensor* id_embeds,
@@ -707,14 +631,14 @@ class StableDiffusionGGML {
         return res;
     }
 
-    SDCondition get_svd_condition(ggml_context* work_ctx,
+    SDCondition StableDiffusionGGML::get_svd_condition(ggml_context* work_ctx,
                                   sd_image_t init_image,
                                   int width,
                                   int height,
-                                  int fps                    = 6,
-                                  int motion_bucket_id       = 127,
-                                  float augmentation_level   = 0.f,
-                                  bool force_zero_embeddings = false) {
+                                  int fps,
+                                  int motion_bucket_id,
+                                  float augmentation_level,
+                                  bool force_zero_embeddings) {
         // c_crossattn
         int64_t t0 = ggml_time_ms();
         struct ggml_tensor* c_crossattn = NULL;
@@ -785,7 +709,7 @@ class StableDiffusionGGML {
         return {c_crossattn, y, c_concat};
     }
 
-    ggml_tensor* sample(ggml_context* work_ctx,
+    ggml_tensor* StableDiffusionGGML::sample(ggml_context* work_ctx,
                         ggml_tensor* init_latent,
                         ggml_tensor* noise,
                         SDCondition cond,
@@ -800,11 +724,11 @@ class StableDiffusionGGML {
                         const std::vector<float>& sigmas,
                         int start_merge_step,
                         SDCondition id_cond,
-                        std::vector<int> skip_layers = {},
-                        float slg_scale              = 0,
-                        float skip_layer_start       = 0.01,
-                        float skip_layer_end         = 0.2,
-                        ggml_tensor* noise_mask      = nullptr) {
+                        std::vector<int> skip_layers,
+                        float slg_scale,
+                        float skip_layer_start,
+                        float skip_layer_end,
+                        ggml_tensor* noise_mask) {
         LOG_DEBUG("Sample");
         struct ggml_init_params params;
         size_t data_size = ggml_row_size(init_latent->type, init_latent->ne[0]);
@@ -1004,7 +928,7 @@ class StableDiffusionGGML {
     }
 
     // ldm.models.diffusion.ddpm.LatentDiffusion.get_first_stage_encoding
-    ggml_tensor* get_first_stage_encoding(ggml_context* work_ctx, ggml_tensor* moments) {
+    ggml_tensor* StableDiffusionGGML::get_first_stage_encoding(ggml_context* work_ctx, ggml_tensor* moments) {
         // ldm.modules.distributions.distributions.DiagonalGaussianDistribution.sample
         ggml_tensor* latent = ggml_new_tensor_4d(work_ctx, moments->type, moments->ne[0], moments->ne[1], moments->ne[2] / 2, moments->ne[3]);
         struct ggml_tensor* noise = ggml_dup_tensor(work_ctx, latent);
@@ -1035,7 +959,7 @@ class StableDiffusionGGML {
         return latent;
     }
 
-    ggml_tensor* compute_first_stage(ggml_context* work_ctx, ggml_tensor* x, bool decode) {
+    ggml_tensor* StableDiffusionGGML::compute_first_stage(ggml_context* work_ctx, ggml_tensor* x, bool decode) {
         int64_t W = x->ne[0];
         int64_t H = x->ne[1];
         int64_t C = 8;
@@ -1094,20 +1018,22 @@ class StableDiffusionGGML {
         return result;
     }
 
-    ggml_tensor* encode_first_stage(ggml_context* work_ctx, ggml_tensor* x) {
+    ggml_tensor* StableDiffusionGGML::encode_first_stage(ggml_context* work_ctx, ggml_tensor* x) {
         return compute_first_stage(work_ctx, x, false);
     }
 
-    ggml_tensor* decode_first_stage(ggml_context* work_ctx, ggml_tensor* x) {
+    ggml_tensor* StableDiffusionGGML::decode_first_stage(ggml_context* work_ctx, ggml_tensor* x) {
         return compute_first_stage(work_ctx, x, true);
     }
-};
+
+
+
+
 
 /*================================================= SD API ==================================================*/
 
-struct sd_ctx_t {
-    StableDiffusionGGML* sd = NULL;
-};
+
 
 sd_ctx_t* new_sd_ctx(const char* model_path_c_str,
                      const char* clip_l_path_c_str,
@@ -1943,3 +1869,4 @@ SD_API sd_image_t* img2vid(sd_ctx_t* sd_ctx,
 
     return result_images;
 }
+
diff --git a/stable-diffusion.hpp b/stable-diffusion.hpp
new file mode 100644
index 000000000..177dae178
--- /dev/null
+++ b/stable-diffusion.hpp
@@ -0,0 +1,170 @@
+#ifndef STABLE_DIFFUSION_HPP
+#define STABLE_DIFFUSION_HPP
+#include "ggml_extend.hpp"
+
+#include "model.h"
+#include "rng.hpp"
+#include "rng_philox.hpp"
+#include "stable-diffusion.h"
+#include "util.h"
+
+#include "conditioner.hpp"
+#include "control.hpp"
+#include "denoiser.hpp"
+#include "diffusion_model.hpp"
+#include "esrgan.hpp"
+#include "lora.hpp"
+#include "pmid.hpp"
+#include "tae.hpp"
+#include "vae.hpp"
+
+
+
+/*================================================== Helper Functions ================================================*/
+
+void calculate_alphas_cumprod(float* alphas_cumprod,
+                              float linear_start = 0.00085f,
+                              float linear_end   = 0.0120,
+                              int timesteps      = TIMESTEPS) {
+    float ls_sqrt = sqrtf(linear_start);
+    float le_sqrt = sqrtf(linear_end);
+    float amount  = le_sqrt - ls_sqrt;
+    float product = 1.0f;
+    for (int i = 0; i < timesteps; i++) {
+        float beta = ls_sqrt + amount * ((float)i / (timesteps - 1));
+        product *= 1.0f - powf(beta, 2.0f);
+        alphas_cumprod[i] = product;
+    }
+}
+
+/*=============================================== StableDiffusionGGML ================================================*/
+
+class StableDiffusionGGML {
+public:
+    ggml_backend_t backend             = NULL;  // general backend
+    ggml_backend_t clip_backend        = NULL;
+    ggml_backend_t control_net_backend = NULL;
+    ggml_backend_t vae_backend         = NULL;
+    ggml_type model_wtype           = GGML_TYPE_COUNT;
+    ggml_type conditioner_wtype     = GGML_TYPE_COUNT;
+    ggml_type diffusion_model_wtype = GGML_TYPE_COUNT;
+    ggml_type vae_wtype             = GGML_TYPE_COUNT;
+
+    SDVersion version;
+    bool vae_decode_only         = false;
+    bool free_params_immediately = false;
+
+    std::shared_ptr<RNG> rng = std::make_shared<STDDefaultRNG>();
+    int n_threads            = -1;
+    float scale_factor       = 0.18215f;
+
+    std::shared_ptr<Conditioner> cond_stage_model;
+    std::shared_ptr<FrozenCLIPVisionEmbedder> clip_vision;  // for svd
+    std::shared_ptr<DiffusionModel> diffusion_model;
+    std::shared_ptr<AutoEncoderKL> first_stage_model;
+    std::shared_ptr<TinyAutoEncoder> tae_first_stage;
+    std::shared_ptr<ControlNet> control_net;
+    std::shared_ptr<PhotoMakerIDEncoder> pmid_model;
+    std::shared_ptr<LoraModel> pmid_lora;
+    std::shared_ptr<PhotoMakerIDEmbed> pmid_id_embeds;
+
+    std::string taesd_path;
+    bool use_tiny_autoencoder = false;
+    bool vae_tiling           = false;
+    bool stacked_id           = false;
+
+    std::map<std::string, struct ggml_tensor*> tensors;
+
+    std::string lora_model_dir;
+    // lora_name => multiplier
+    std::unordered_map<std::string, float> curr_lora_state;
+
+    std::shared_ptr<Denoiser> denoiser = std::make_shared<CompVisDenoiser>();
+
+    StableDiffusionGGML() = default;
+
+    StableDiffusionGGML(int n_threads,
+                        bool vae_decode_only,
+                        bool free_params_immediately,
+                        std::string lora_model_dir,
+                        rng_type_t rng_type);
+
+    ~StableDiffusionGGML();
+
+    bool load_from_file(const std::string& model_path,
+                        const std::string& clip_l_path,
+                        const std::string& clip_g_path,
+                        const std::string& t5xxl_path,
+                        const std::string& diffusion_model_path,
+                        const std::string& vae_path,
+                        const std::string control_net_path,
+                        const std::string embeddings_path,
+                        const std::string id_embeddings_path,
+                        const std::string& taesd_path,
+                        bool vae_tiling_,
+                        ggml_type wtype,
+                        schedule_t schedule,
+                        bool clip_on_cpu,
+                        bool control_net_cpu,
+                        bool vae_on_cpu,
+                        bool diffusion_flash_attn);
+
+    bool is_using_v_parameterization_for_sd2(ggml_context* work_ctx, bool is_inpaint = false);
+
+    void apply_lora(const std::string& lora_name, float multiplier);
+
+    void apply_loras(const std::unordered_map<std::string, float>& lora_state);
+
+    ggml_tensor* id_encoder(ggml_context* work_ctx,
+                            ggml_tensor* init_img,
+                            ggml_tensor* prompts_embeds,
+                            ggml_tensor* id_embeds,
+                            std::vector<bool>& class_tokens_mask);
+
+    SDCondition get_svd_condition(ggml_context* work_ctx,
+                                  sd_image_t init_image,
+                                  int width,
+                                  int height,
+                                  int fps                    = 6,
+                                  int motion_bucket_id       = 127,
+                                  float augmentation_level   = 0.f,
+                                  bool force_zero_embeddings = false);
+
+    ggml_tensor* sample(ggml_context* work_ctx,
+                        ggml_tensor* init_latent,
+                        ggml_tensor* noise,
+                        SDCondition cond,
+                        SDCondition uncond,
+                        ggml_tensor* control_hint,
+                        float control_strength,
+                        float min_cfg,
+                        float cfg_scale,
+                        float guidance,
+                        float eta,
+                        sample_method_t method,
+                        const std::vector<float>& sigmas,
+                        int start_merge_step,
+                        SDCondition id_cond,
+                        std::vector<int> skip_layers = {},
+                        float slg_scale              = 0,
+                        float skip_layer_start       = 0.01,
+                        float skip_layer_end         = 0.2,
+                        ggml_tensor* noise_mask      = nullptr);
+
+    // ldm.models.diffusion.ddpm.LatentDiffusion.get_first_stage_encoding
+    ggml_tensor* get_first_stage_encoding(ggml_context* work_ctx, ggml_tensor* moments);
+
+    ggml_tensor* compute_first_stage(ggml_context* work_ctx, ggml_tensor* x, bool decode);
+
+    ggml_tensor* encode_first_stage(ggml_context* work_ctx, ggml_tensor* x);
+
+    ggml_tensor* decode_first_stage(ggml_context* work_ctx, ggml_tensor* x);
+};
+
+/*================================================= SD API ==================================================*/
+
+struct sd_ctx_t {
+    StableDiffusionGGML* sd = NULL;
+};
+
+#endif // STABLE_DIFFUSION_HPP
\ No newline at end of file

From 105c67d63dcc03cf2a84025d739638dd8c0dbf2c Mon Sep 17 00:00:00 2001
From: "Harris M. Snyder"
Date: Sat, 7 Jun 2025 21:30:55 -0400
Subject: [PATCH 2/5] purge submodules

---
 .gitmodules | 3 ---
 ggml        | 1 -
 2 files changed, 4 deletions(-)
 delete mode 100644 .gitmodules
 delete mode 160000 ggml

diff --git a/.gitmodules b/.gitmodules
deleted file mode 100644
index d9d943713..000000000
--- a/.gitmodules
+++ /dev/null
@@ -1,3 +0,0 @@
-[submodule "ggml"]
-    path = ggml
-    url = https://github.com/ggerganov/ggml.git
diff --git a/ggml b/ggml
deleted file mode 160000
index ff9052988..000000000
--- a/ggml
+++ /dev/null
@@ -1 +0,0 @@
-Subproject commit ff9052988b76e137bcf92bb335733933ca196ac0
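
With patches 1 and 2 applied, the C++ internals are visible to any translation unit that includes stable-diffusion.hpp, so the pipeline object can be constructed directly instead of going through the opaque sd_ctx_t C API. A minimal sketch of such a consumer follows; the checkpoint path and argument values are placeholders, and the load_from_file argument list mirrors the declaration in the new header:

    // sketch.cpp -- hypothetical direct use of the exposed internals
    #include "stable-diffusion.hpp"

    int main() {
        StableDiffusionGGML sd(/*n_threads*/ 8,
                               /*vae_decode_only*/ false,
                               /*free_params_immediately*/ false,
                               /*lora_model_dir*/ "",
                               STD_DEFAULT_RNG);

        // Empty strings fall back to the tensors inside the main checkpoint.
        bool ok = sd.load_from_file("sd-v1-5.safetensors",  // model_path (placeholder)
                                    "", "", "",             // clip_l, clip_g, t5xxl
                                    "", "",                 // diffusion model, vae
                                    "", "", "", "",         // control net, embeddings, id embeddings, taesd
                                    /*vae_tiling_*/ false,
                                    GGML_TYPE_COUNT,        // keep the checkpoint's weight type
                                    DEFAULT,                // schedule_t
                                    /*clip_on_cpu*/ false,
                                    /*control_net_cpu*/ false,
                                    /*vae_on_cpu*/ false,
                                    /*diffusion_flash_attn*/ false);
        return ok ? 0 : 1;
    }
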
Snyder" Date: Sat, 7 Jun 2025 22:26:12 -0400 Subject: [PATCH 3/5] allow skip unet load --- stable-diffusion.cpp | 11 ++++++++--- stable-diffusion.hpp | 10 +++++++++- 2 files changed, 17 insertions(+), 4 deletions(-) diff --git a/stable-diffusion.cpp b/stable-diffusion.cpp index d8a9fd10c..6fe712fff 100644 --- a/stable-diffusion.cpp +++ b/stable-diffusion.cpp @@ -83,7 +83,8 @@ bool StableDiffusionGGML::load_from_file(const std::string& model_path, bool clip_on_cpu, bool control_net_cpu, bool vae_on_cpu, - bool diffusion_flash_attn) { + bool diffusion_flash_attn, + StableDiffusionLoadConfiguration details) { use_tiny_autoencoder = taesd_path.size() > 0; #ifdef SD_USE_CUDA LOG_DEBUG("Using CUDA backend"); @@ -272,8 +273,12 @@ bool StableDiffusionGGML::load_from_file(const std::string& model_path, cond_stage_model->alloc_params_buffer(); cond_stage_model->get_param_tensors(tensors); - diffusion_model->alloc_params_buffer(); - diffusion_model->get_param_tensors(tensors); + if(!details.skip_unet) { + diffusion_model->alloc_params_buffer(); + diffusion_model->get_param_tensors(tensors); + } + assert(!details.skip_vae);// not implemented yet + assert(!details.skip_text_encoders);// not implemented yet if (!use_tiny_autoencoder) { if (vae_on_cpu && !ggml_backend_is_cpu(backend)) { diff --git a/stable-diffusion.hpp b/stable-diffusion.hpp index 177dae178..f4373a840 100644 --- a/stable-diffusion.hpp +++ b/stable-diffusion.hpp @@ -37,6 +37,13 @@ void calculate_alphas_cumprod(float* alphas_cumprod, } } +struct StableDiffusionLoadConfiguration +{ + bool skip_unet = false; + bool skip_vae = false; + bool skip_text_encoders = false; +}; + /*=============================================== StableDiffusionGGML ================================================*/ class StableDiffusionGGML { @@ -108,7 +115,8 @@ class StableDiffusionGGML { bool clip_on_cpu, bool control_net_cpu, bool vae_on_cpu, - bool diffusion_flash_attn); + bool diffusion_flash_attn, + StableDiffusionLoadConfiguration details = StableDiffusionLoadConfiguration()); bool is_using_v_parameterization_for_sd2(ggml_context* work_ctx, bool is_inpaint = false); From 8a63b34694d36cfb6999f5ca95b3593556d41be7 Mon Sep 17 00:00:00 2001 From: "Harris M. 
Snyder" Date: Sun, 8 Jun 2025 15:36:52 -0400 Subject: [PATCH 4/5] factor out vae inference functions --- stable-diffusion.cpp | 907 ++++++++++++++++++++++--------------------- stable-diffusion.hpp | 15 + 2 files changed, 476 insertions(+), 446 deletions(-) diff --git a/stable-diffusion.cpp b/stable-diffusion.cpp index 6fe712fff..1f124849a 100644 --- a/stable-diffusion.cpp +++ b/stable-diffusion.cpp @@ -6,6 +6,93 @@ #include "stb_image.h" +// ldm.models.diffusion.ddpm.LatentDiffusion.get_first_stage_encoding +ggml_tensor* vae_sample(ggml_context* work_ctx, ggml_tensor* moments, float scale_factor, std::shared_ptr rng) +{ + // ldm.modules.distributions.distributions.DiagonalGaussianDistribution.sample + ggml_tensor* latent = ggml_new_tensor_4d(work_ctx, moments->type, moments->ne[0], moments->ne[1], moments->ne[2] / 2, moments->ne[3]); + struct ggml_tensor* noise = ggml_dup_tensor(work_ctx, latent); + ggml_tensor_set_f32_randn(noise, rng); + // noise = load_tensor_from_file(work_ctx, "noise.bin"); + { + float mean = 0; + float logvar = 0; + float value = 0; + float std_ = 0; + for (int i = 0; i < latent->ne[3]; i++) { + for (int j = 0; j < latent->ne[2]; j++) { + for (int k = 0; k < latent->ne[1]; k++) { + for (int l = 0; l < latent->ne[0]; l++) { + mean = ggml_tensor_get_f32(moments, l, k, j, i); + logvar = ggml_tensor_get_f32(moments, l, k, j + (int)latent->ne[2], i); + logvar = std::max(-30.0f, std::min(logvar, 20.0f)); + std_ = std::exp(0.5f * logvar); + value = mean + std_ * ggml_tensor_get_f32(noise, l, k, j, i); + value = value * scale_factor; + // printf("%d %d %d %d -> %f\n", i, j, k, l, value); + ggml_tensor_set_f32(latent, value, l, k, j, i); + } + } + } + } + } + return latent; +} + +ggml_tensor* vae_run( + AutoEncoderKL *first_stage_model, + ggml_context* work_ctx, + ggml_tensor* x, + bool decode, + SDVersion version, + float scale_factor, + bool vae_tiling, + int n_threads) +{ + // I ripped out the tiny autoencoder code paths from here... sorry + int64_t W = x->ne[0]; + int64_t H = x->ne[1]; + int64_t C = 8; + + if (sd_version_is_sd3(version)) { + C = 32; + } else if (sd_version_is_flux(version)) { + C = 32; + } + + ggml_tensor* result = ggml_new_tensor_4d(work_ctx, GGML_TYPE_F32, + decode ? (W * 8) : (W / 8), // width + decode ? (H * 8) : (H / 8), // height + decode ? 3 : C, + x->ne[3]); // channels + int64_t t0 = ggml_time_ms(); + if (decode) { + ggml_tensor_scale(x, 1.0f / scale_factor); + } else { + ggml_tensor_scale_input(x); + } + if (vae_tiling && decode) { // TODO: support tiling vae encode + // split latent in 32x32 tiles and compute in several steps + auto on_tiling = [&](ggml_tensor* in, ggml_tensor* out, bool init) { + first_stage_model->compute(n_threads, in, decode, &out); + }; + sd_tiling(x, result, 8, 32, 0.5f, on_tiling); + } else { + first_stage_model->compute(n_threads, x, decode, &result); + } + first_stage_model->free_compute_buffer(); + if (decode) { + ggml_tensor_scale_output(result); + } + + int64_t t1 = ggml_time_ms(); + LOG_DEBUG("computing vae [mode: %s] graph completed, taking %.2fs", decode ? 
"DECODE" : "ENCODE", (t1 - t0) * 1.0f / 1000); + if (decode) { + ggml_tensor_clamp(result, 0.0f, 1.0f); + } + return result; +} + // #define STB_IMAGE_WRITE_IMPLEMENTATION // #define STB_IMAGE_WRITE_STATIC @@ -531,505 +618,433 @@ bool StableDiffusionGGML::load_from_file(const std::string& model_path, return true; } - bool StableDiffusionGGML::is_using_v_parameterization_for_sd2(ggml_context* work_ctx, bool is_inpaint) { - struct ggml_tensor* x_t = ggml_new_tensor_4d(work_ctx, GGML_TYPE_F32, 8, 8, 4, 1); - ggml_set_f32(x_t, 0.5); - struct ggml_tensor* c = ggml_new_tensor_4d(work_ctx, GGML_TYPE_F32, 1024, 2, 1, 1); - ggml_set_f32(c, 0.5); +bool StableDiffusionGGML::is_using_v_parameterization_for_sd2(ggml_context* work_ctx, bool is_inpaint) { + struct ggml_tensor* x_t = ggml_new_tensor_4d(work_ctx, GGML_TYPE_F32, 8, 8, 4, 1); + ggml_set_f32(x_t, 0.5); + struct ggml_tensor* c = ggml_new_tensor_4d(work_ctx, GGML_TYPE_F32, 1024, 2, 1, 1); + ggml_set_f32(c, 0.5); - struct ggml_tensor* timesteps = ggml_new_tensor_1d(work_ctx, GGML_TYPE_F32, 1); - ggml_set_f32(timesteps, 999); + struct ggml_tensor* timesteps = ggml_new_tensor_1d(work_ctx, GGML_TYPE_F32, 1); + ggml_set_f32(timesteps, 999); - struct ggml_tensor* concat = is_inpaint ? ggml_new_tensor_4d(work_ctx, GGML_TYPE_F32, 8, 8, 5, 1) : NULL; - if (concat != NULL) { - ggml_set_f32(concat, 0); - } + struct ggml_tensor* concat = is_inpaint ? ggml_new_tensor_4d(work_ctx, GGML_TYPE_F32, 8, 8, 5, 1) : NULL; + if (concat != NULL) { + ggml_set_f32(concat, 0); + } - int64_t t0 = ggml_time_ms(); - struct ggml_tensor* out = ggml_dup_tensor(work_ctx, x_t); - diffusion_model->compute(n_threads, x_t, timesteps, c, concat, NULL, NULL, -1, {}, 0.f, &out); - diffusion_model->free_compute_buffer(); + int64_t t0 = ggml_time_ms(); + struct ggml_tensor* out = ggml_dup_tensor(work_ctx, x_t); + diffusion_model->compute(n_threads, x_t, timesteps, c, concat, NULL, NULL, -1, {}, 0.f, &out); + diffusion_model->free_compute_buffer(); - double result = 0.f; - { - float* vec_x = (float*)x_t->data; - float* vec_out = (float*)out->data; + double result = 0.f; + { + float* vec_x = (float*)x_t->data; + float* vec_out = (float*)out->data; - int64_t n = ggml_nelements(out); + int64_t n = ggml_nelements(out); - for (int i = 0; i < n; i++) { - result += ((double)vec_out[i] - (double)vec_x[i]); - } - result /= n; - } - int64_t t1 = ggml_time_ms(); - LOG_DEBUG("check is_using_v_parameterization_for_sd2, taking %.2fs", (t1 - t0) * 1.0f / 1000); - return result < -1; - } - - void StableDiffusionGGML::apply_lora(const std::string& lora_name, float multiplier) { - int64_t t0 = ggml_time_ms(); - std::string st_file_path = path_join(lora_model_dir, lora_name + ".safetensors"); - std::string ckpt_file_path = path_join(lora_model_dir, lora_name + ".ckpt"); - std::string file_path; - if (file_exists(st_file_path)) { - file_path = st_file_path; - } else if (file_exists(ckpt_file_path)) { - file_path = ckpt_file_path; - } else { - LOG_WARN("can not find %s or %s for lora %s", st_file_path.c_str(), ckpt_file_path.c_str(), lora_name.c_str()); - return; - } - LoraModel lora(backend, file_path); - if (!lora.load_from_file()) { - LOG_WARN("load lora tensors from %s failed", file_path.c_str()); - return; + for (int i = 0; i < n; i++) { + result += ((double)vec_out[i] - (double)vec_x[i]); } + result /= n; + } + int64_t t1 = ggml_time_ms(); + LOG_DEBUG("check is_using_v_parameterization_for_sd2, taking %.2fs", (t1 - t0) * 1.0f / 1000); + return result < -1; +} - lora.multiplier = multiplier; - // TODO: 
send version? - lora.apply(tensors, version, n_threads); - lora.free_params_buffer(); +void StableDiffusionGGML::apply_lora(const std::string& lora_name, float multiplier) { + int64_t t0 = ggml_time_ms(); + std::string st_file_path = path_join(lora_model_dir, lora_name + ".safetensors"); + std::string ckpt_file_path = path_join(lora_model_dir, lora_name + ".ckpt"); + std::string file_path; + if (file_exists(st_file_path)) { + file_path = st_file_path; + } else if (file_exists(ckpt_file_path)) { + file_path = ckpt_file_path; + } else { + LOG_WARN("can not find %s or %s for lora %s", st_file_path.c_str(), ckpt_file_path.c_str(), lora_name.c_str()); + return; + } + LoraModel lora(backend, file_path); + if (!lora.load_from_file()) { + LOG_WARN("load lora tensors from %s failed", file_path.c_str()); + return; + } - int64_t t1 = ggml_time_ms(); + lora.multiplier = multiplier; + // TODO: send version? + lora.apply(tensors, version, n_threads); + lora.free_params_buffer(); + + int64_t t1 = ggml_time_ms(); - LOG_INFO("lora '%s' applied, taking %.2fs", lora_name.c_str(), (t1 - t0) * 1.0f / 1000); + LOG_INFO("lora '%s' applied, taking %.2fs", lora_name.c_str(), (t1 - t0) * 1.0f / 1000); +} + +void StableDiffusionGGML::apply_loras(const std::unordered_map& lora_state) { + if (lora_state.size() > 0 && model_wtype != GGML_TYPE_F16 && model_wtype != GGML_TYPE_F32) { + LOG_WARN("In quantized models when applying LoRA, the images have poor quality."); + } + std::unordered_map lora_state_diff; + for (auto& kv : lora_state) { + const std::string& lora_name = kv.first; + float multiplier = kv.second; + lora_state_diff[lora_name] += multiplier; + } + for (auto& kv : curr_lora_state) { + const std::string& lora_name = kv.first; + float curr_multiplier = kv.second; + lora_state_diff[lora_name] -= curr_multiplier; } - void StableDiffusionGGML::apply_loras(const std::unordered_map& lora_state) { - if (lora_state.size() > 0 && model_wtype != GGML_TYPE_F16 && model_wtype != GGML_TYPE_F32) { - LOG_WARN("In quantized models when applying LoRA, the images have poor quality."); - } - std::unordered_map lora_state_diff; - for (auto& kv : lora_state) { - const std::string& lora_name = kv.first; - float multiplier = kv.second; - lora_state_diff[lora_name] += multiplier; - } - for (auto& kv : curr_lora_state) { - const std::string& lora_name = kv.first; - float curr_multiplier = kv.second; - lora_state_diff[lora_name] -= curr_multiplier; - } + size_t rm = lora_state_diff.size() - lora_state.size(); + if (rm != 0) { + LOG_INFO("Attempting to apply %lu LoRAs (removing %lu applied LoRAs)", lora_state.size(), rm); + } else { + LOG_INFO("Attempting to apply %lu LoRAs", lora_state.size()); + } + + for (auto& kv : lora_state_diff) { + apply_lora(kv.first, kv.second); + } + + curr_lora_state = lora_state; +} + +ggml_tensor* StableDiffusionGGML::id_encoder(ggml_context* work_ctx, + ggml_tensor* init_img, + ggml_tensor* prompts_embeds, + ggml_tensor* id_embeds, + std::vector& class_tokens_mask) { + ggml_tensor* res = NULL; + pmid_model->compute(n_threads, init_img, prompts_embeds, id_embeds, class_tokens_mask, &res, work_ctx); + return res; +} - size_t rm = lora_state_diff.size() - lora_state.size(); - if (rm != 0) { - LOG_INFO("Attempting to apply %lu LoRAs (removing %lu applied LoRAs)", lora_state.size(), rm); +SDCondition StableDiffusionGGML::get_svd_condition(ggml_context* work_ctx, + sd_image_t init_image, + int width, + int height, + int fps , + int motion_bucket_id , + float augmentation_level , + bool force_zero_embeddings ) 
{ + // c_crossattn + int64_t t0 = ggml_time_ms(); + struct ggml_tensor* c_crossattn = NULL; + { + if (force_zero_embeddings) { + c_crossattn = ggml_new_tensor_1d(work_ctx, GGML_TYPE_F32, clip_vision->vision_model.projection_dim); + ggml_set_f32(c_crossattn, 0.f); } else { - LOG_INFO("Attempting to apply %lu LoRAs", lora_state.size()); + sd_image_f32_t image = sd_image_t_to_sd_image_f32_t(init_image); + sd_image_f32_t resized_image = clip_preprocess(image, clip_vision->vision_model.image_size); + free(image.data); + image.data = NULL; + + ggml_tensor* pixel_values = ggml_new_tensor_4d(work_ctx, GGML_TYPE_F32, resized_image.width, resized_image.height, 3, 1); + sd_image_f32_to_tensor(resized_image.data, pixel_values, false); + free(resized_image.data); + resized_image.data = NULL; + + // print_ggml_tensor(pixel_values); + clip_vision->compute(n_threads, pixel_values, &c_crossattn, work_ctx); + // print_ggml_tensor(c_crossattn); } + } - for (auto& kv : lora_state_diff) { - apply_lora(kv.first, kv.second); - } + // c_concat + struct ggml_tensor* c_concat = NULL; + { + if (force_zero_embeddings) { + c_concat = ggml_new_tensor_4d(work_ctx, GGML_TYPE_F32, width / 8, height / 8, 4, 1); + ggml_set_f32(c_concat, 0.f); + } else { + ggml_tensor* init_img = ggml_new_tensor_4d(work_ctx, GGML_TYPE_F32, width, height, 3, 1); - curr_lora_state = lora_state; - } - - ggml_tensor* StableDiffusionGGML::id_encoder(ggml_context* work_ctx, - ggml_tensor* init_img, - ggml_tensor* prompts_embeds, - ggml_tensor* id_embeds, - std::vector& class_tokens_mask) { - ggml_tensor* res = NULL; - pmid_model->compute(n_threads, init_img, prompts_embeds, id_embeds, class_tokens_mask, &res, work_ctx); - return res; - } - - SDCondition StableDiffusionGGML::get_svd_condition(ggml_context* work_ctx, - sd_image_t init_image, - int width, - int height, - int fps , - int motion_bucket_id , - float augmentation_level , - bool force_zero_embeddings ) { - // c_crossattn - int64_t t0 = ggml_time_ms(); - struct ggml_tensor* c_crossattn = NULL; - { - if (force_zero_embeddings) { - c_crossattn = ggml_new_tensor_1d(work_ctx, GGML_TYPE_F32, clip_vision->vision_model.projection_dim); - ggml_set_f32(c_crossattn, 0.f); - } else { + if (width != init_image.width || height != init_image.height) { sd_image_f32_t image = sd_image_t_to_sd_image_f32_t(init_image); - sd_image_f32_t resized_image = clip_preprocess(image, clip_vision->vision_model.image_size); + sd_image_f32_t resized_image = resize_sd_image_f32_t(image, width, height); free(image.data); image.data = NULL; - - ggml_tensor* pixel_values = ggml_new_tensor_4d(work_ctx, GGML_TYPE_F32, resized_image.width, resized_image.height, 3, 1); - sd_image_f32_to_tensor(resized_image.data, pixel_values, false); + sd_image_f32_to_tensor(resized_image.data, init_img, false); free(resized_image.data); resized_image.data = NULL; - - // print_ggml_tensor(pixel_values); - clip_vision->compute(n_threads, pixel_values, &c_crossattn, work_ctx); - // print_ggml_tensor(c_crossattn); - } - } - - // c_concat - struct ggml_tensor* c_concat = NULL; - { - if (force_zero_embeddings) { - c_concat = ggml_new_tensor_4d(work_ctx, GGML_TYPE_F32, width / 8, height / 8, 4, 1); - ggml_set_f32(c_concat, 0.f); } else { - ggml_tensor* init_img = ggml_new_tensor_4d(work_ctx, GGML_TYPE_F32, width, height, 3, 1); - - if (width != init_image.width || height != init_image.height) { - sd_image_f32_t image = sd_image_t_to_sd_image_f32_t(init_image); - sd_image_f32_t resized_image = resize_sd_image_f32_t(image, width, height); - 
free(image.data); - image.data = NULL; - sd_image_f32_to_tensor(resized_image.data, init_img, false); - free(resized_image.data); - resized_image.data = NULL; - } else { - sd_image_to_tensor(init_image.data, init_img); - } - if (augmentation_level > 0.f) { - struct ggml_tensor* noise = ggml_dup_tensor(work_ctx, init_img); - ggml_tensor_set_f32_randn(noise, rng); - // encode_pixels += torch.randn_like(pixels) * augmentation_level - ggml_tensor_scale(noise, augmentation_level); - ggml_tensor_add(init_img, noise); - } - ggml_tensor* moments = encode_first_stage(work_ctx, init_img); - c_concat = get_first_stage_encoding(work_ctx, moments); + sd_image_to_tensor(init_image.data, init_img); } + if (augmentation_level > 0.f) { + struct ggml_tensor* noise = ggml_dup_tensor(work_ctx, init_img); + ggml_tensor_set_f32_randn(noise, rng); + // encode_pixels += torch.randn_like(pixels) * augmentation_level + ggml_tensor_scale(noise, augmentation_level); + ggml_tensor_add(init_img, noise); + } + ggml_tensor* moments = encode_first_stage(work_ctx, init_img); + c_concat = get_first_stage_encoding(work_ctx, moments); } + } - // y - struct ggml_tensor* y = NULL; - { - y = ggml_new_tensor_1d(work_ctx, GGML_TYPE_F32, diffusion_model->get_adm_in_channels()); - int out_dim = 256; - int fps_id = fps - 1; - std::vector timesteps = {(float)fps_id, (float)motion_bucket_id, augmentation_level}; - set_timestep_embedding(timesteps, y, out_dim); - } - int64_t t1 = ggml_time_ms(); - LOG_DEBUG("computing svd condition graph completed, taking %" PRId64 " ms", t1 - t0); - return {c_crossattn, y, c_concat}; - } - - ggml_tensor* StableDiffusionGGML::sample(ggml_context* work_ctx, - ggml_tensor* init_latent, - ggml_tensor* noise, - SDCondition cond, - SDCondition uncond, - ggml_tensor* control_hint, - float control_strength, - float min_cfg, - float cfg_scale, - float guidance, - float eta, - sample_method_t method, - const std::vector& sigmas, - int start_merge_step, - SDCondition id_cond, - std::vector skip_layers , - float slg_scale , - float skip_layer_start , - float skip_layer_end , - ggml_tensor* noise_mask ) { - LOG_DEBUG("Sample"); - struct ggml_init_params params; - size_t data_size = ggml_row_size(init_latent->type, init_latent->ne[0]); - for (int i = 1; i < 4; i++) { - data_size *= init_latent->ne[i]; - } - data_size += 1024; - params.mem_size = data_size * 3; - params.mem_buffer = NULL; - params.no_alloc = false; - ggml_context* tmp_ctx = ggml_init(params); - - size_t steps = sigmas.size() - 1; - // noise = load_tensor_from_file(work_ctx, "./rand0.bin"); - // print_ggml_tensor(noise); - struct ggml_tensor* x = ggml_dup_tensor(work_ctx, init_latent); - copy_ggml_tensor(x, init_latent); - x = denoiser->noise_scaling(sigmas[0], noise, x); - - struct ggml_tensor* noised_input = ggml_dup_tensor(work_ctx, noise); - - bool has_unconditioned = cfg_scale != 1.0 && uncond.c_crossattn != NULL; - bool has_skiplayer = slg_scale != 0.0 && skip_layers.size() > 0; + // y + struct ggml_tensor* y = NULL; + { + y = ggml_new_tensor_1d(work_ctx, GGML_TYPE_F32, diffusion_model->get_adm_in_channels()); + int out_dim = 256; + int fps_id = fps - 1; + std::vector timesteps = {(float)fps_id, (float)motion_bucket_id, augmentation_level}; + set_timestep_embedding(timesteps, y, out_dim); + } + int64_t t1 = ggml_time_ms(); + LOG_DEBUG("computing svd condition graph completed, taking %" PRId64 " ms", t1 - t0); + return {c_crossattn, y, c_concat}; +} - // denoise wrapper - struct ggml_tensor* out_cond = ggml_dup_tensor(work_ctx, x); - struct 
ggml_tensor* out_uncond = NULL; - struct ggml_tensor* out_skip = NULL; +ggml_tensor* StableDiffusionGGML::sample(ggml_context* work_ctx, + ggml_tensor* init_latent, + ggml_tensor* noise, + SDCondition cond, + SDCondition uncond, + ggml_tensor* control_hint, + float control_strength, + float min_cfg, + float cfg_scale, + float guidance, + float eta, + sample_method_t method, + const std::vector& sigmas, + int start_merge_step, + SDCondition id_cond, + std::vector skip_layers , + float slg_scale , + float skip_layer_start , + float skip_layer_end , + ggml_tensor* noise_mask ) { + LOG_DEBUG("Sample"); + struct ggml_init_params params; + size_t data_size = ggml_row_size(init_latent->type, init_latent->ne[0]); + for (int i = 1; i < 4; i++) { + data_size *= init_latent->ne[i]; + } + data_size += 1024; + params.mem_size = data_size * 3; + params.mem_buffer = NULL; + params.no_alloc = false; + ggml_context* tmp_ctx = ggml_init(params); + + size_t steps = sigmas.size() - 1; + // noise = load_tensor_from_file(work_ctx, "./rand0.bin"); + // print_ggml_tensor(noise); + struct ggml_tensor* x = ggml_dup_tensor(work_ctx, init_latent); + copy_ggml_tensor(x, init_latent); + x = denoiser->noise_scaling(sigmas[0], noise, x); + + struct ggml_tensor* noised_input = ggml_dup_tensor(work_ctx, noise); + + bool has_unconditioned = cfg_scale != 1.0 && uncond.c_crossattn != NULL; + bool has_skiplayer = slg_scale != 0.0 && skip_layers.size() > 0; + + // denoise wrapper + struct ggml_tensor* out_cond = ggml_dup_tensor(work_ctx, x); + struct ggml_tensor* out_uncond = NULL; + struct ggml_tensor* out_skip = NULL; + + if (has_unconditioned) { + out_uncond = ggml_dup_tensor(work_ctx, x); + } + if (has_skiplayer) { + if (sd_version_is_dit(version)) { + out_skip = ggml_dup_tensor(work_ctx, x); + } else { + has_skiplayer = false; + LOG_WARN("SLG is incompatible with %s models", model_version_to_str[version]); + } + } + struct ggml_tensor* denoised = ggml_dup_tensor(work_ctx, x); - if (has_unconditioned) { - out_uncond = ggml_dup_tensor(work_ctx, x); + auto denoise = [&](ggml_tensor* input, float sigma, int step) -> ggml_tensor* { + if (step == 1) { + pretty_progress(0, (int)steps, 0); } - if (has_skiplayer) { - if (sd_version_is_dit(version)) { - out_skip = ggml_dup_tensor(work_ctx, x); - } else { - has_skiplayer = false; - LOG_WARN("SLG is incompatible with %s models", model_version_to_str[version]); - } + int64_t t0 = ggml_time_us(); + + std::vector scaling = denoiser->get_scalings(sigma); + GGML_ASSERT(scaling.size() == 3); + float c_skip = scaling[0]; + float c_out = scaling[1]; + float c_in = scaling[2]; + + float t = denoiser->sigma_to_t(sigma); + std::vector timesteps_vec(x->ne[3], t); // [N, ] + auto timesteps = vector_to_ggml_tensor(work_ctx, timesteps_vec); + std::vector guidance_vec(x->ne[3], guidance); + auto guidance_tensor = vector_to_ggml_tensor(work_ctx, guidance_vec); + + copy_ggml_tensor(noised_input, input); + // noised_input = noised_input * c_in + ggml_tensor_scale(noised_input, c_in); + + std::vector controls; + + if (control_hint != NULL) { + control_net->compute(n_threads, noised_input, control_hint, timesteps, cond.c_crossattn, cond.c_vector); + controls = control_net->controls; + // print_ggml_tensor(controls[12]); + // GGML_ASSERT(0); } - struct ggml_tensor* denoised = ggml_dup_tensor(work_ctx, x); - - auto denoise = [&](ggml_tensor* input, float sigma, int step) -> ggml_tensor* { - if (step == 1) { - pretty_progress(0, (int)steps, 0); - } - int64_t t0 = ggml_time_us(); - - std::vector scaling = 
denoiser->get_scalings(sigma); - GGML_ASSERT(scaling.size() == 3); - float c_skip = scaling[0]; - float c_out = scaling[1]; - float c_in = scaling[2]; - float t = denoiser->sigma_to_t(sigma); - std::vector timesteps_vec(x->ne[3], t); // [N, ] - auto timesteps = vector_to_ggml_tensor(work_ctx, timesteps_vec); - std::vector guidance_vec(x->ne[3], guidance); - auto guidance_tensor = vector_to_ggml_tensor(work_ctx, guidance_vec); - - copy_ggml_tensor(noised_input, input); - // noised_input = noised_input * c_in - ggml_tensor_scale(noised_input, c_in); - - std::vector controls; + if (start_merge_step == -1 || step <= start_merge_step) { + // cond + diffusion_model->compute(n_threads, + noised_input, + timesteps, + cond.c_crossattn, + cond.c_concat, + cond.c_vector, + guidance_tensor, + -1, + controls, + control_strength, + &out_cond); + } else { + diffusion_model->compute(n_threads, + noised_input, + timesteps, + id_cond.c_crossattn, + cond.c_concat, + id_cond.c_vector, + guidance_tensor, + -1, + controls, + control_strength, + &out_cond); + } + float* negative_data = NULL; + if (has_unconditioned) { + // uncond if (control_hint != NULL) { - control_net->compute(n_threads, noised_input, control_hint, timesteps, cond.c_crossattn, cond.c_vector); + control_net->compute(n_threads, noised_input, control_hint, timesteps, uncond.c_crossattn, uncond.c_vector); controls = control_net->controls; - // print_ggml_tensor(controls[12]); - // GGML_ASSERT(0); - } - - if (start_merge_step == -1 || step <= start_merge_step) { - // cond - diffusion_model->compute(n_threads, - noised_input, - timesteps, - cond.c_crossattn, - cond.c_concat, - cond.c_vector, - guidance_tensor, - -1, - controls, - control_strength, - &out_cond); - } else { - diffusion_model->compute(n_threads, - noised_input, - timesteps, - id_cond.c_crossattn, - cond.c_concat, - id_cond.c_vector, - guidance_tensor, - -1, - controls, - control_strength, - &out_cond); } + diffusion_model->compute(n_threads, + noised_input, + timesteps, + uncond.c_crossattn, + uncond.c_concat, + uncond.c_vector, + guidance_tensor, + -1, + controls, + control_strength, + &out_uncond); + negative_data = (float*)out_uncond->data; + } - float* negative_data = NULL; + int step_count = sigmas.size(); + bool is_skiplayer_step = has_skiplayer && step > (int)(skip_layer_start * step_count) && step < (int)(skip_layer_end * step_count); + float* skip_layer_data = NULL; + if (is_skiplayer_step) { + LOG_DEBUG("Skipping layers at step %d\n", step); + // skip layer (same as conditionned) + diffusion_model->compute(n_threads, + noised_input, + timesteps, + cond.c_crossattn, + cond.c_concat, + cond.c_vector, + guidance_tensor, + -1, + controls, + control_strength, + &out_skip, + NULL, + skip_layers); + skip_layer_data = (float*)out_skip->data; + } + float* vec_denoised = (float*)denoised->data; + float* vec_input = (float*)input->data; + float* positive_data = (float*)out_cond->data; + int ne_elements = (int)ggml_nelements(denoised); + for (int i = 0; i < ne_elements; i++) { + float latent_result = positive_data[i]; if (has_unconditioned) { - // uncond - if (control_hint != NULL) { - control_net->compute(n_threads, noised_input, control_hint, timesteps, uncond.c_crossattn, uncond.c_vector); - controls = control_net->controls; + // out_uncond + cfg_scale * (out_cond - out_uncond) + int64_t ne3 = out_cond->ne[3]; + if (min_cfg != cfg_scale && ne3 != 1) { + int64_t i3 = i / out_cond->ne[0] * out_cond->ne[1] * out_cond->ne[2]; + float scale = min_cfg + (cfg_scale - min_cfg) * (i3 * 1.0f / 
ne3); + } else { + latent_result = negative_data[i] + cfg_scale * (positive_data[i] - negative_data[i]); } - diffusion_model->compute(n_threads, - noised_input, - timesteps, - uncond.c_crossattn, - uncond.c_concat, - uncond.c_vector, - guidance_tensor, - -1, - controls, - control_strength, - &out_uncond); - negative_data = (float*)out_uncond->data; } - - int step_count = sigmas.size(); - bool is_skiplayer_step = has_skiplayer && step > (int)(skip_layer_start * step_count) && step < (int)(skip_layer_end * step_count); - float* skip_layer_data = NULL; if (is_skiplayer_step) { - LOG_DEBUG("Skipping layers at step %d\n", step); - // skip layer (same as conditionned) - diffusion_model->compute(n_threads, - noised_input, - timesteps, - cond.c_crossattn, - cond.c_concat, - cond.c_vector, - guidance_tensor, - -1, - controls, - control_strength, - &out_skip, - NULL, - skip_layers); - skip_layer_data = (float*)out_skip->data; - } - float* vec_denoised = (float*)denoised->data; - float* vec_input = (float*)input->data; - float* positive_data = (float*)out_cond->data; - int ne_elements = (int)ggml_nelements(denoised); - for (int i = 0; i < ne_elements; i++) { - float latent_result = positive_data[i]; - if (has_unconditioned) { - // out_uncond + cfg_scale * (out_cond - out_uncond) - int64_t ne3 = out_cond->ne[3]; - if (min_cfg != cfg_scale && ne3 != 1) { - int64_t i3 = i / out_cond->ne[0] * out_cond->ne[1] * out_cond->ne[2]; - float scale = min_cfg + (cfg_scale - min_cfg) * (i3 * 1.0f / ne3); - } else { - latent_result = negative_data[i] + cfg_scale * (positive_data[i] - negative_data[i]); - } - } - if (is_skiplayer_step) { - latent_result = latent_result + (positive_data[i] - skip_layer_data[i]) * slg_scale; - } - // v = latent_result, eps = latent_result - // denoised = (v * c_out + input * c_skip) or (input + eps * c_out) - vec_denoised[i] = latent_result * c_out + vec_input[i] * c_skip; - } - int64_t t1 = ggml_time_us(); - if (step > 0) { - pretty_progress(step, (int)steps, (t1 - t0) / 1000000.f); - // LOG_INFO("step %d sampling completed taking %.2fs", step, (t1 - t0) * 1.0f / 1000000); + latent_result = latent_result + (positive_data[i] - skip_layer_data[i]) * slg_scale; } - if (noise_mask != nullptr) { - for (int64_t x = 0; x < denoised->ne[0]; x++) { - for (int64_t y = 0; y < denoised->ne[1]; y++) { - float mask = ggml_tensor_get_f32(noise_mask, x, y); - for (int64_t k = 0; k < denoised->ne[2]; k++) { - float init = ggml_tensor_get_f32(init_latent, x, y, k); - float den = ggml_tensor_get_f32(denoised, x, y, k); - ggml_tensor_set_f32(denoised, init + mask * (den - init), x, y, k); - } + // v = latent_result, eps = latent_result + // denoised = (v * c_out + input * c_skip) or (input + eps * c_out) + vec_denoised[i] = latent_result * c_out + vec_input[i] * c_skip; + } + int64_t t1 = ggml_time_us(); + if (step > 0) { + pretty_progress(step, (int)steps, (t1 - t0) / 1000000.f); + // LOG_INFO("step %d sampling completed taking %.2fs", step, (t1 - t0) * 1.0f / 1000000); + } + if (noise_mask != nullptr) { + for (int64_t x = 0; x < denoised->ne[0]; x++) { + for (int64_t y = 0; y < denoised->ne[1]; y++) { + float mask = ggml_tensor_get_f32(noise_mask, x, y); + for (int64_t k = 0; k < denoised->ne[2]; k++) { + float init = ggml_tensor_get_f32(init_latent, x, y, k); + float den = ggml_tensor_get_f32(denoised, x, y, k); + ggml_tensor_set_f32(denoised, init + mask * (den - init), x, y, k); } } } + } - return denoised; - }; + return denoised; + }; - sample_k_diffusion(method, denoise, work_ctx, x, sigmas, rng, 
eta); + sample_k_diffusion(method, denoise, work_ctx, x, sigmas, rng, eta); - x = denoiser->inverse_noise_scaling(sigmas[sigmas.size() - 1], x); + x = denoiser->inverse_noise_scaling(sigmas[sigmas.size() - 1], x); - if (control_net) { - control_net->free_control_ctx(); - control_net->free_compute_buffer(); - } - diffusion_model->free_compute_buffer(); - return x; - } - - // ldm.models.diffusion.ddpm.LatentDiffusion.get_first_stage_encoding - ggml_tensor* StableDiffusionGGML::get_first_stage_encoding(ggml_context* work_ctx, ggml_tensor* moments) { - // ldm.modules.distributions.distributions.DiagonalGaussianDistribution.sample - ggml_tensor* latent = ggml_new_tensor_4d(work_ctx, moments->type, moments->ne[0], moments->ne[1], moments->ne[2] / 2, moments->ne[3]); - struct ggml_tensor* noise = ggml_dup_tensor(work_ctx, latent); - ggml_tensor_set_f32_randn(noise, rng); - // noise = load_tensor_from_file(work_ctx, "noise.bin"); - { - float mean = 0; - float logvar = 0; - float value = 0; - float std_ = 0; - for (int i = 0; i < latent->ne[3]; i++) { - for (int j = 0; j < latent->ne[2]; j++) { - for (int k = 0; k < latent->ne[1]; k++) { - for (int l = 0; l < latent->ne[0]; l++) { - mean = ggml_tensor_get_f32(moments, l, k, j, i); - logvar = ggml_tensor_get_f32(moments, l, k, j + (int)latent->ne[2], i); - logvar = std::max(-30.0f, std::min(logvar, 20.0f)); - std_ = std::exp(0.5f * logvar); - value = mean + std_ * ggml_tensor_get_f32(noise, l, k, j, i); - value = value * scale_factor; - // printf("%d %d %d %d -> %f\n", i, j, k, l, value); - ggml_tensor_set_f32(latent, value, l, k, j, i); - } - } - } - } - } - return latent; + if (control_net) { + control_net->free_control_ctx(); + control_net->free_compute_buffer(); } + diffusion_model->free_compute_buffer(); + return x; +} - ggml_tensor* StableDiffusionGGML::compute_first_stage(ggml_context* work_ctx, ggml_tensor* x, bool decode) { - int64_t W = x->ne[0]; - int64_t H = x->ne[1]; - int64_t C = 8; - if (use_tiny_autoencoder) { - C = 4; - } else { - if (sd_version_is_sd3(version)) { - C = 32; - } else if (sd_version_is_flux(version)) { - C = 32; - } - } - ggml_tensor* result = ggml_new_tensor_4d(work_ctx, GGML_TYPE_F32, - decode ? (W * 8) : (W / 8), // width - decode ? (H * 8) : (H / 8), // height - decode ? 
3 : C, - x->ne[3]); // channels - int64_t t0 = ggml_time_ms(); - if (!use_tiny_autoencoder) { - if (decode) { - ggml_tensor_scale(x, 1.0f / scale_factor); - } else { - ggml_tensor_scale_input(x); - } - if (vae_tiling && decode) { // TODO: support tiling vae encode - // split latent in 32x32 tiles and compute in several steps - auto on_tiling = [&](ggml_tensor* in, ggml_tensor* out, bool init) { - first_stage_model->compute(n_threads, in, decode, &out); - }; - sd_tiling(x, result, 8, 32, 0.5f, on_tiling); - } else { - first_stage_model->compute(n_threads, x, decode, &result); - } - first_stage_model->free_compute_buffer(); - if (decode) { - ggml_tensor_scale_output(result); - } - } else { - if (vae_tiling && decode) { // TODO: support tiling vae encode - // split latent in 64x64 tiles and compute in several steps - auto on_tiling = [&](ggml_tensor* in, ggml_tensor* out, bool init) { - tae_first_stage->compute(n_threads, in, decode, &out); - }; - sd_tiling(x, result, 8, 64, 0.5f, on_tiling); - } else { - tae_first_stage->compute(n_threads, x, decode, &result); - } - tae_first_stage->free_compute_buffer(); - } +// ldm.models.diffusion.ddpm.LatentDiffusion.get_first_stage_encoding +ggml_tensor* StableDiffusionGGML::get_first_stage_encoding(ggml_context* work_ctx, ggml_tensor* moments) { + return vae_sample(work_ctx, moments, this->scale_factor, this->rng); +} - int64_t t1 = ggml_time_ms(); - LOG_DEBUG("computing vae [mode: %s] graph completed, taking %.2fs", decode ? "DECODE" : "ENCODE", (t1 - t0) * 1.0f / 1000); - if (decode) { - ggml_tensor_clamp(result, 0.0f, 1.0f); - } - return result; - } +ggml_tensor* StableDiffusionGGML::compute_first_stage(ggml_context* work_ctx, ggml_tensor* x, bool decode) +{ +assert(!use_tiny_autoencoder); +return vae_run( + this->first_stage_model.get(), + work_ctx, + x, + decode, + this->version, + this->scale_factor, + this->vae_tiling, + this->n_threads); +} - ggml_tensor* StableDiffusionGGML::encode_first_stage(ggml_context* work_ctx, ggml_tensor* x) { - return compute_first_stage(work_ctx, x, false); - } +ggml_tensor* StableDiffusionGGML::encode_first_stage(ggml_context* work_ctx, ggml_tensor* x) { + return compute_first_stage(work_ctx, x, false); +} - ggml_tensor* StableDiffusionGGML::decode_first_stage(ggml_context* work_ctx, ggml_tensor* x) { - return compute_first_stage(work_ctx, x, true); - } +ggml_tensor* StableDiffusionGGML::decode_first_stage(ggml_context* work_ctx, ggml_tensor* x) { + return compute_first_stage(work_ctx, x, true); +} diff --git a/stable-diffusion.hpp b/stable-diffusion.hpp index f4373a840..e79769b46 100644 --- a/stable-diffusion.hpp +++ b/stable-diffusion.hpp @@ -22,6 +22,21 @@ /*================================================== Helper Functions ================================================*/ +ggml_tensor* vae_sample(ggml_context* work_ctx, + ggml_tensor* moments, + float scale_factor, + std::shared_ptr rng); + +ggml_tensor* vae_run( + AutoEncoderKL *first_stage_model, + ggml_context* work_ctx, + ggml_tensor* x, + bool decode, + SDVersion version, + float scale_factor, + bool vae_tiling, + int n_threads); + void calculate_alphas_cumprod(float* alphas_cumprod, float linear_start = 0.00085f, float linear_end = 0.0120, From f53bd06b4edaba363f3aa9f746a82c826d283848 Mon Sep 17 00:00:00 2001 From: "Harris M. 
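
Because vae_run and vae_sample are now free functions, VAE inference can be driven without a full StableDiffusionGGML instance, given an already-loaded AutoEncoderKL. A rough sketch under those assumptions (model loading and work-context allocation are elided; VERSION_SD1 and the 0.18215f scale factor correspond to SD 1.x):

    // Hypothetical standalone decode of a latent to pixels.
    // `vae` is a loaded std::shared_ptr<AutoEncoderKL>; `latent` is a
    // [W/8, H/8, 4, 1] float32 tensor allocated in `work_ctx`.
    ggml_tensor* image = vae_run(vae.get(),
                                 work_ctx,
                                 latent,
                                 /*decode*/ true,
                                 VERSION_SD1,
                                 /*scale_factor*/ 0.18215f,
                                 /*vae_tiling*/ false,
                                 /*n_threads*/ 8);
    // `image` comes back as [W, H, 3, 1], clamped to [0, 1].
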
Snyder" Date: Sun, 8 Jun 2025 15:59:53 -0400 Subject: [PATCH 5/5] cleanup cpp/hpp split --- stable-diffusion.cpp | 15 +++++++++++++++ stable-diffusion.hpp | 12 +----------- 2 files changed, 16 insertions(+), 11 deletions(-) diff --git a/stable-diffusion.cpp b/stable-diffusion.cpp index 1f124849a..c86d84100 100644 --- a/stable-diffusion.cpp +++ b/stable-diffusion.cpp @@ -6,6 +6,21 @@ #include "stb_image.h" +void calculate_alphas_cumprod(float* alphas_cumprod, + float linear_start, + float linear_end, + int timesteps) { + float ls_sqrt = sqrtf(linear_start); + float le_sqrt = sqrtf(linear_end); + float amount = le_sqrt - ls_sqrt; + float product = 1.0f; + for (int i = 0; i < timesteps; i++) { + float beta = ls_sqrt + amount * ((float)i / (timesteps - 1)); + product *= 1.0f - powf(beta, 2.0f); + alphas_cumprod[i] = product; + } +} + // ldm.models.diffusion.ddpm.LatentDiffusion.get_first_stage_encoding ggml_tensor* vae_sample(ggml_context* work_ctx, ggml_tensor* moments, float scale_factor, std::shared_ptr rng) { diff --git a/stable-diffusion.hpp b/stable-diffusion.hpp index e79769b46..773447ca2 100644 --- a/stable-diffusion.hpp +++ b/stable-diffusion.hpp @@ -40,17 +40,7 @@ ggml_tensor* vae_run( void calculate_alphas_cumprod(float* alphas_cumprod, float linear_start = 0.00085f, float linear_end = 0.0120, - int timesteps = TIMESTEPS) { - float ls_sqrt = sqrtf(linear_start); - float le_sqrt = sqrtf(linear_end); - float amount = le_sqrt - ls_sqrt; - float product = 1.0f; - for (int i = 0; i < timesteps; i++) { - float beta = ls_sqrt + amount * ((float)i / (timesteps - 1)); - product *= 1.0f - powf(beta, 2.0f); - alphas_cumprod[i] = product; - } -} + int timesteps = TIMESTEPS); struct StableDiffusionLoadConfiguration {