From 4eef4c8fa827cab9c36465039e94ba20c5cb5bf1 Mon Sep 17 00:00:00 2001
From: "Harris M. Snyder"
Date: Sat, 7 Jun 2025 21:28:58 -0400
Subject: [PATCH 1/5] refactor to expose c++ internals

---
 .gitignore           |   3 +-
 CMakeLists.txt       |   8 +-
 stable-diffusion.cpp | 183 +++++++++++++------------------------------
 stable-diffusion.hpp | 170 ++++++++++++++++++++++++++++++++++++++++
 4 files changed, 234 insertions(+), 130 deletions(-)
 create mode 100644 stable-diffusion.hpp

diff --git a/.gitignore b/.gitignore
index 38fe570df..4274effab 100644
--- a/.gitignore
+++ b/.gitignore
@@ -10,4 +10,5 @@ test/
 *.gguf
 output*.png
 models*
-*.log
\ No newline at end of file
+*.log
+.idea/
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 782a893e4..6d7d30c35 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -117,7 +117,13 @@ add_definitions(-DGGML_MAX_NAME=128)
 # deps
 # Only add ggml if it hasn't been added yet
 if (NOT TARGET ggml)
-    add_subdirectory(ggml)
+    include(FetchContent)
+    FetchContent_Declare(
+        ggml
+        GIT_REPOSITORY https://github.com/ggerganov/ggml.git
+        GIT_TAG ff9052988b76e137bcf92bb335733933ca196ac0
+    )
+    FetchContent_MakeAvailable(ggml)
 endif()
 add_subdirectory(thirdparty)
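
Since ggml is now fetched at configure time (pinned to the same commit the old submodule pointed at), a checkout no longer needs `git submodule update --init`. For illustration, a downstream project could consume this library the same way. This is a hypothetical consumer CMakeLists.txt, assuming the library target is named `stable-diffusion` as in this repository's CMakeLists; the project name and GIT_TAG are placeholders:

    cmake_minimum_required(VERSION 3.14)  # FetchContent_MakeAvailable requires >= 3.14
    project(sd_consumer CXX)

    include(FetchContent)
    FetchContent_Declare(
        stable-diffusion-cpp
        GIT_REPOSITORY https://github.com/leejet/stable-diffusion.cpp.git
        GIT_TAG        master  # placeholder -- pin a commit hash in practice
    )
    FetchContent_MakeAvailable(stable-diffusion-cpp)

    add_executable(demo main.cpp)
    target_link_libraries(demo PRIVATE stable-diffusion)
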
diff --git a/stable-diffusion.cpp b/stable-diffusion.cpp
index e38a6101f..d8a9fd10c 100644
--- a/stable-diffusion.cpp
+++ b/stable-diffusion.cpp
@@ -1,25 +1,12 @@
-#include "ggml_extend.hpp"
-
-#include "model.h"
-#include "rng.hpp"
-#include "rng_philox.hpp"
-#include "stable-diffusion.h"
-#include "util.h"
-
-#include "conditioner.hpp"
-#include "control.hpp"
-#include "denoiser.hpp"
-#include "diffusion_model.hpp"
-#include "esrgan.hpp"
-#include "lora.hpp"
-#include "pmid.hpp"
-#include "tae.hpp"
-#include "vae.hpp"
+#include "stable-diffusion.hpp"
+
 #define STB_IMAGE_IMPLEMENTATION
 #define STB_IMAGE_STATIC
 #include "stb_image.h"
+
+
 // #define STB_IMAGE_WRITE_IMPLEMENTATION
 // #define STB_IMAGE_WRITE_STATIC
 // #include "stb_image_write.h"
@@ -51,99 +38,36 @@ const char* sampling_methods_str[] = {
     "TCD"
 };
 
-/*================================================== Helper Functions ================================================*/
-
-void calculate_alphas_cumprod(float* alphas_cumprod,
-                              float linear_start = 0.00085f,
-                              float linear_end   = 0.0120,
-                              int timesteps      = TIMESTEPS) {
-    float ls_sqrt = sqrtf(linear_start);
-    float le_sqrt = sqrtf(linear_end);
-    float amount  = le_sqrt - ls_sqrt;
-    float product = 1.0f;
-    for (int i = 0; i < timesteps; i++) {
-        float beta = ls_sqrt + amount * ((float)i / (timesteps - 1));
-        product *= 1.0f - powf(beta, 2.0f);
-        alphas_cumprod[i] = product;
+StableDiffusionGGML::StableDiffusionGGML(int n_threads,
+                                         bool vae_decode_only,
+                                         bool free_params_immediately,
+                                         std::string lora_model_dir,
+                                         rng_type_t rng_type)
+    : n_threads(n_threads),
+      vae_decode_only(vae_decode_only),
+      free_params_immediately(free_params_immediately),
+      lora_model_dir(lora_model_dir) {
+    if (rng_type == STD_DEFAULT_RNG) {
+        rng = std::make_shared<STDDefaultRNG>();
+    } else if (rng_type == CUDA_RNG) {
+        rng = std::make_shared<PhiloxRNG>();
     }
 }
 
-/*=============================================== StableDiffusionGGML ================================================*/
-
-class StableDiffusionGGML {
-public:
-    ggml_backend_t backend             = NULL;  // general backend
-    ggml_backend_t clip_backend        = NULL;
-    ggml_backend_t control_net_backend = NULL;
-    ggml_backend_t vae_backend         = NULL;
-    ggml_type model_wtype           = GGML_TYPE_COUNT;
-    ggml_type conditioner_wtype     = GGML_TYPE_COUNT;
-    ggml_type diffusion_model_wtype = GGML_TYPE_COUNT;
-    ggml_type vae_wtype             = GGML_TYPE_COUNT;
-
-    SDVersion version;
-    bool vae_decode_only         = false;
-    bool free_params_immediately = false;
-
-    std::shared_ptr<RNG> rng = std::make_shared<STDDefaultRNG>();
-    int n_threads            = -1;
-    float scale_factor       = 0.18215f;
-
-    std::shared_ptr<Conditioner> cond_stage_model;
-    std::shared_ptr<FrozenCLIPVisionEmbedder> clip_vision;  // for svd
-    std::shared_ptr<DiffusionModel> diffusion_model;
-    std::shared_ptr<AutoEncoderKL> first_stage_model;
-    std::shared_ptr<TinyAutoEncoder> tae_first_stage;
-    std::shared_ptr<ControlNet> control_net;
-    std::shared_ptr<PhotoMakerIDEncoder> pmid_model;
-    std::shared_ptr<LoraModel> pmid_lora;
-    std::shared_ptr<PhotoMakerIDEmbed> pmid_id_embeds;
-
-    std::string taesd_path;
-    bool use_tiny_autoencoder = false;
-    bool vae_tiling           = false;
-    bool stacked_id           = false;
-
-    std::map<std::string, struct ggml_tensor*> tensors;
-
-    std::string lora_model_dir;
-    // lora_name => multiplier
-    std::unordered_map<std::string, float> curr_lora_state;
-
-    std::shared_ptr<Denoiser> denoiser = std::make_shared<CompVisDenoiser>();
-
-    StableDiffusionGGML() = default;
-
-    StableDiffusionGGML(int n_threads,
-                        bool vae_decode_only,
-                        bool free_params_immediately,
-                        std::string lora_model_dir,
-                        rng_type_t rng_type)
-        : n_threads(n_threads),
-          vae_decode_only(vae_decode_only),
-          free_params_immediately(free_params_immediately),
-          lora_model_dir(lora_model_dir) {
-        if (rng_type == STD_DEFAULT_RNG) {
-            rng = std::make_shared<STDDefaultRNG>();
-        } else if (rng_type == CUDA_RNG) {
-            rng = std::make_shared<PhiloxRNG>();
-        }
+StableDiffusionGGML::~StableDiffusionGGML() {
+    if (clip_backend != backend) {
+        ggml_backend_free(clip_backend);
     }
-
-    ~StableDiffusionGGML() {
-        if (clip_backend != backend) {
-            ggml_backend_free(clip_backend);
-        }
-        if (control_net_backend != backend) {
-            ggml_backend_free(control_net_backend);
-        }
-        if (vae_backend != backend) {
-            ggml_backend_free(vae_backend);
-        }
-        ggml_backend_free(backend);
+    if (control_net_backend != backend) {
+        ggml_backend_free(control_net_backend);
     }
+    if (vae_backend != backend) {
+        ggml_backend_free(vae_backend);
+    }
+    ggml_backend_free(backend);
+}
 
-    bool load_from_file(const std::string& model_path,
+bool StableDiffusionGGML::load_from_file(const std::string& model_path,
                         const std::string& clip_l_path,
                         const std::string& clip_g_path,
                         const std::string& t5xxl_path,
@@ -602,7 +526,7 @@ class StableDiffusionGGML {
         return true;
     }
 
-    bool is_using_v_parameterization_for_sd2(ggml_context* work_ctx, bool is_inpaint = false) {
+    bool StableDiffusionGGML::is_using_v_parameterization_for_sd2(ggml_context* work_ctx, bool is_inpaint) {
         struct ggml_tensor* x_t = ggml_new_tensor_4d(work_ctx, GGML_TYPE_F32, 8, 8, 4, 1);
         ggml_set_f32(x_t, 0.5);
         struct ggml_tensor* c = ggml_new_tensor_4d(work_ctx, GGML_TYPE_F32, 1024, 2, 1, 1);
@@ -638,7 +562,7 @@ class StableDiffusionGGML {
         return result < -1;
     }
 
-    void apply_lora(const std::string& lora_name, float multiplier) {
+    void StableDiffusionGGML::apply_lora(const std::string& lora_name, float multiplier) {
         int64_t t0                 = ggml_time_ms();
         std::string st_file_path   = path_join(lora_model_dir, lora_name + ".safetensors");
         std::string ckpt_file_path = path_join(lora_model_dir, lora_name + ".ckpt");
@@ -667,7 +591,7 @@ class StableDiffusionGGML {
         LOG_INFO("lora '%s' applied, taking %.2fs", lora_name.c_str(), (t1 - t0) * 1.0f / 1000);
     }
 
-    void apply_loras(const std::unordered_map<std::string, float>& lora_state) {
+    void StableDiffusionGGML::apply_loras(const std::unordered_map<std::string, float>& lora_state) {
         if (lora_state.size() > 0 && model_wtype != GGML_TYPE_F16 && model_wtype != GGML_TYPE_F32) {
             LOG_WARN("In quantized models when applying LoRA, the images have poor quality.");
         }
@@ -682,7 +606,7 @@ class StableDiffusionGGML {
             float curr_multiplier = kv.second;
             lora_state_diff[lora_name] -= curr_multiplier;
         }
-
+
         size_t rm = lora_state_diff.size() - lora_state.size();
         if (rm != 0) {
             LOG_INFO("Attempting to apply %lu LoRAs (removing %lu applied LoRAs)", lora_state.size(), rm);
@@ -697,7 +621,7 @@ class StableDiffusionGGML {
         curr_lora_state = lora_state;
     }
 
-    ggml_tensor* id_encoder(ggml_context* work_ctx,
+    ggml_tensor* StableDiffusionGGML::id_encoder(ggml_context* work_ctx,
                             ggml_tensor* init_img,
                             ggml_tensor* prompts_embeds,
                             ggml_tensor* id_embeds,
@@ -707,14 +631,14 @@ class StableDiffusionGGML {
         return res;
     }
 
-    SDCondition get_svd_condition(ggml_context* work_ctx,
+    SDCondition StableDiffusionGGML::get_svd_condition(ggml_context* work_ctx,
                                   sd_image_t init_image,
                                   int width,
                                   int height,
-                                  int fps                    = 6,
-                                  int motion_bucket_id       = 127,
-                                  float augmentation_level   = 0.f,
-                                  bool force_zero_embeddings = false) {
+                                  int fps,
+                                  int motion_bucket_id,
+                                  float augmentation_level,
+                                  bool force_zero_embeddings) {
         // c_crossattn
         int64_t t0 = ggml_time_ms();
         struct ggml_tensor* c_crossattn = NULL;
@@ -785,7 +709,7 @@ class StableDiffusionGGML {
         return {c_crossattn, y, c_concat};
     }
 
-    ggml_tensor* sample(ggml_context* work_ctx,
+    ggml_tensor* StableDiffusionGGML::sample(ggml_context* work_ctx,
                         ggml_tensor* init_latent,
                         ggml_tensor* noise,
                         SDCondition cond,
@@ -800,11 +724,11 @@ class StableDiffusionGGML {
                         const std::vector<float>& sigmas,
                         int start_merge_step,
                         SDCondition id_cond,
-                        std::vector<int> skip_layers = {},
-                        float slg_scale              = 0,
-                        float skip_layer_start       = 0.01,
-                        float skip_layer_end         = 0.2,
-                        ggml_tensor* noise_mask      = nullptr) {
+                        std::vector<int> skip_layers,
+                        float slg_scale,
+                        float skip_layer_start,
+                        float skip_layer_end,
+                        ggml_tensor* noise_mask) {
         LOG_DEBUG("Sample");
         struct ggml_init_params params;
         size_t data_size = ggml_row_size(init_latent->type, init_latent->ne[0]);
@@ -1004,7 +928,7 @@ class StableDiffusionGGML {
     }
 
     // ldm.models.diffusion.ddpm.LatentDiffusion.get_first_stage_encoding
-    ggml_tensor* get_first_stage_encoding(ggml_context* work_ctx, ggml_tensor* moments) {
+    ggml_tensor* StableDiffusionGGML::get_first_stage_encoding(ggml_context* work_ctx, ggml_tensor* moments) {
         // ldm.modules.distributions.distributions.DiagonalGaussianDistribution.sample
         ggml_tensor* latent = ggml_new_tensor_4d(work_ctx, moments->type, moments->ne[0], moments->ne[1], moments->ne[2] / 2, moments->ne[3]);
         struct ggml_tensor* noise = ggml_dup_tensor(work_ctx, latent);
@@ -1035,7 +959,7 @@ class StableDiffusionGGML {
         return latent;
     }
 
-    ggml_tensor* compute_first_stage(ggml_context* work_ctx, ggml_tensor* x, bool decode) {
+    ggml_tensor* StableDiffusionGGML::compute_first_stage(ggml_context* work_ctx, ggml_tensor* x, bool decode) {
         int64_t W = x->ne[0];
         int64_t H = x->ne[1];
         int64_t C = 8;
@@ -1094,20 +1018,22 @@ class StableDiffusionGGML {
         return result;
     }
 
-    ggml_tensor* encode_first_stage(ggml_context* work_ctx, ggml_tensor* x) {
+    ggml_tensor* StableDiffusionGGML::encode_first_stage(ggml_context* work_ctx, ggml_tensor* x) {
         return compute_first_stage(work_ctx, x, false);
     }
 
-    ggml_tensor* decode_first_stage(ggml_context* work_ctx, ggml_tensor* x) {
+    ggml_tensor* StableDiffusionGGML::decode_first_stage(ggml_context* work_ctx, ggml_tensor* x) {
         return compute_first_stage(work_ctx, x, true);
     }
-};
+
+
+
+
 
 /*================================================= SD API ==================================================*/
 
-struct sd_ctx_t {
-    StableDiffusionGGML* sd = NULL;
-};
+
 
 sd_ctx_t* new_sd_ctx(const char* model_path_c_str,
                      const char* clip_l_path_c_str,
@@ -1943,3 +1869,4 @@ SD_API sd_image_t* img2vid(sd_ctx_t* sd_ctx,
 
     return result_images;
 }
+
diff --git a/stable-diffusion.hpp b/stable-diffusion.hpp
new file mode 100644
index 000000000..177dae178
--- /dev/null
+++ b/stable-diffusion.hpp
@@ -0,0 +1,170 @@
+#ifndef STABLE_DIFFUSION_HPP
+#define STABLE_DIFFUSION_HPP
+#include "ggml_extend.hpp"
+
+#include "model.h"
+#include "rng.hpp"
+#include "rng_philox.hpp"
+#include "stable-diffusion.h"
+#include "util.h"
+
+#include "conditioner.hpp"
+#include "control.hpp"
+#include "denoiser.hpp"
+#include "diffusion_model.hpp"
+#include "esrgan.hpp"
+#include "lora.hpp"
+#include "pmid.hpp"
+#include "tae.hpp"
+#include "vae.hpp"
+
+
+
+/*================================================== Helper Functions ================================================*/
+
+void calculate_alphas_cumprod(float* alphas_cumprod,
+                              float linear_start = 0.00085f,
+                              float linear_end   = 0.0120,
+                              int timesteps      = TIMESTEPS) {
+    float ls_sqrt = sqrtf(linear_start);
+    float le_sqrt = sqrtf(linear_end);
+    float amount  = le_sqrt - ls_sqrt;
+    float product = 1.0f;
+    for (int i = 0; i < timesteps; i++) {
+        float beta = ls_sqrt + amount * ((float)i / (timesteps - 1));
+        product *= 1.0f - powf(beta, 2.0f);
+        alphas_cumprod[i] = product;
+    }
+}
+
+/*=============================================== StableDiffusionGGML ================================================*/
+
+class StableDiffusionGGML {
+public:
+    ggml_backend_t backend             = NULL;  // general backend
+    ggml_backend_t clip_backend        = NULL;
+    ggml_backend_t control_net_backend = NULL;
+    ggml_backend_t vae_backend         = NULL;
+    ggml_type model_wtype           = GGML_TYPE_COUNT;
+    ggml_type conditioner_wtype     = GGML_TYPE_COUNT;
+    ggml_type diffusion_model_wtype = GGML_TYPE_COUNT;
+    ggml_type vae_wtype             = GGML_TYPE_COUNT;
+
+    SDVersion version;
+    bool vae_decode_only         = false;
+    bool free_params_immediately = false;
+
+    std::shared_ptr<RNG> rng = std::make_shared<STDDefaultRNG>();
+    int n_threads            = -1;
+    float scale_factor       = 0.18215f;
+
+    std::shared_ptr<Conditioner> cond_stage_model;
+    std::shared_ptr<FrozenCLIPVisionEmbedder> clip_vision;  // for svd
+    std::shared_ptr<DiffusionModel> diffusion_model;
+    std::shared_ptr<AutoEncoderKL> first_stage_model;
+    std::shared_ptr<TinyAutoEncoder> tae_first_stage;
+    std::shared_ptr<ControlNet> control_net;
+    std::shared_ptr<PhotoMakerIDEncoder> pmid_model;
+    std::shared_ptr<LoraModel> pmid_lora;
+    std::shared_ptr<PhotoMakerIDEmbed> pmid_id_embeds;
+
+    std::string taesd_path;
+    bool use_tiny_autoencoder = false;
+    bool vae_tiling           = false;
+    bool stacked_id           = false;
+
+    std::map<std::string, struct ggml_tensor*> tensors;
+
+    std::string lora_model_dir;
+    // lora_name => multiplier
+    std::unordered_map<std::string, float> curr_lora_state;
+
+    std::shared_ptr<Denoiser> denoiser = std::make_shared<CompVisDenoiser>();
+
+    StableDiffusionGGML() = default;
+
+    StableDiffusionGGML(int n_threads,
+                        bool vae_decode_only,
+                        bool free_params_immediately,
+                        std::string lora_model_dir,
+                        rng_type_t rng_type);
+
+    ~StableDiffusionGGML();
+
+    bool load_from_file(const std::string& model_path,
+                        const std::string& clip_l_path,
+                        const std::string& clip_g_path,
+                        const std::string& t5xxl_path,
+                        const std::string& diffusion_model_path,
+                        const std::string& vae_path,
+                        const std::string control_net_path,
+                        const std::string embeddings_path,
+                        const std::string id_embeddings_path,
+                        const std::string& taesd_path,
+                        bool vae_tiling_,
+                        ggml_type wtype,
+                        schedule_t schedule,
+                        bool clip_on_cpu,
+                        bool control_net_cpu,
+                        bool vae_on_cpu,
+                        bool diffusion_flash_attn);
+
+    bool is_using_v_parameterization_for_sd2(ggml_context* work_ctx, bool is_inpaint = false);
+
+    void apply_lora(const std::string& lora_name, float multiplier);
+
+    void apply_loras(const std::unordered_map<std::string, float>& lora_state);
+
+    ggml_tensor* id_encoder(ggml_context* work_ctx,
+                            ggml_tensor* init_img,
+                            ggml_tensor* prompts_embeds,
+                            ggml_tensor* id_embeds,
+                            std::vector<bool>& class_tokens_mask);
+
+    SDCondition get_svd_condition(ggml_context* work_ctx,
+                                  sd_image_t init_image,
+                                  int width,
+                                  int height,
+                                  int fps                    = 6,
+                                  int motion_bucket_id       = 127,
+                                  float augmentation_level   = 0.f,
+                                  bool force_zero_embeddings = false);
+
+    ggml_tensor* sample(ggml_context* work_ctx,
+                        ggml_tensor* init_latent,
+                        ggml_tensor* noise,
+                        SDCondition cond,
+                        SDCondition uncond,
+                        ggml_tensor* control_hint,
+                        float control_strength,
+                        float min_cfg,
+                        float cfg_scale,
+                        float guidance,
+                        float eta,
+                        sample_method_t method,
+                        const std::vector<float>& sigmas,
+                        int start_merge_step,
+                        SDCondition id_cond,
+                        std::vector<int> skip_layers = {},
+                        float slg_scale              = 0,
+                        float skip_layer_start       = 0.01,
+                        float skip_layer_end         = 0.2,
+                        ggml_tensor* noise_mask      = nullptr);
+
+    // ldm.models.diffusion.ddpm.LatentDiffusion.get_first_stage_encoding
+    ggml_tensor* get_first_stage_encoding(ggml_context* work_ctx, ggml_tensor* moments);
+
+    ggml_tensor* compute_first_stage(ggml_context* work_ctx, ggml_tensor* x, bool decode);
+
+    ggml_tensor* encode_first_stage(ggml_context* work_ctx, ggml_tensor* x);
+
+    ggml_tensor* decode_first_stage(ggml_context* work_ctx, ggml_tensor* x);
+};
+
+/*================================================= SD API ==================================================*/
+
+struct sd_ctx_t {
+    StableDiffusionGGML* sd = NULL;
+};
+
+#endif // STABLE_DIFFUSION_HPP
\ No newline at end of file

From 105c67d63dcc03cf2a84025d739638dd8c0dbf2c Mon Sep 17 00:00:00 2001
From: "Harris M. Snyder"
Date: Sat, 7 Jun 2025 21:30:55 -0400
Subject: [PATCH 2/5] purge submodules

---
 .gitmodules | 3 ---
 ggml        | 1 -
 2 files changed, 4 deletions(-)
 delete mode 100644 .gitmodules
 delete mode 160000 ggml

diff --git a/.gitmodules b/.gitmodules
deleted file mode 100644
index d9d943713..000000000
--- a/.gitmodules
+++ /dev/null
@@ -1,3 +0,0 @@
-[submodule "ggml"]
-    path = ggml
-    url = https://github.com/ggerganov/ggml.git
diff --git a/ggml b/ggml
deleted file mode 160000
index ff9052988..000000000
--- a/ggml
+++ /dev/null
@@ -1 +0,0 @@
-Subproject commit ff9052988b76e137bcf92bb335733933ca196ac0
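
With patches 1 and 2 applied, the C++ internals are visible to any translation unit that includes stable-diffusion.hpp, so the pipeline object can be constructed directly instead of going through the opaque sd_ctx_t C API. A minimal sketch of such a consumer follows; the checkpoint path and argument values are placeholders, and the load_from_file argument list mirrors the declaration in the new header:

    // sketch.cpp -- hypothetical direct use of the exposed internals
    #include "stable-diffusion.hpp"

    int main() {
        StableDiffusionGGML sd(/*n_threads*/ 8,
                               /*vae_decode_only*/ false,
                               /*free_params_immediately*/ false,
                               /*lora_model_dir*/ "",
                               STD_DEFAULT_RNG);

        // Empty strings fall back to the tensors inside the main checkpoint.
        bool ok = sd.load_from_file("sd-v1-5.safetensors",  // model_path (placeholder)
                                    "", "", "",             // clip_l, clip_g, t5xxl
                                    "", "",                 // diffusion model, vae
                                    "", "", "", "",         // control net, embeddings, id embeddings, taesd
                                    /*vae_tiling_*/ false,
                                    GGML_TYPE_COUNT,        // keep the checkpoint's weight type
                                    DEFAULT,                // schedule_t
                                    /*clip_on_cpu*/ false,
                                    /*control_net_cpu*/ false,
                                    /*vae_on_cpu*/ false,
                                    /*diffusion_flash_attn*/ false);
        return ok ? 0 : 1;
    }
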
Snyder" Date: Sat, 7 Jun 2025 22:26:12 -0400 Subject: [PATCH 3/5] allow skip unet load --- stable-diffusion.cpp | 11 ++++++++--- stable-diffusion.hpp | 10 +++++++++- 2 files changed, 17 insertions(+), 4 deletions(-) diff --git a/stable-diffusion.cpp b/stable-diffusion.cpp index d8a9fd10c..6fe712fff 100644 --- a/stable-diffusion.cpp +++ b/stable-diffusion.cpp @@ -83,7 +83,8 @@ bool StableDiffusionGGML::load_from_file(const std::string& model_path, bool clip_on_cpu, bool control_net_cpu, bool vae_on_cpu, - bool diffusion_flash_attn) { + bool diffusion_flash_attn, + StableDiffusionLoadConfiguration details) { use_tiny_autoencoder = taesd_path.size() > 0; #ifdef SD_USE_CUDA LOG_DEBUG("Using CUDA backend"); @@ -272,8 +273,12 @@ bool StableDiffusionGGML::load_from_file(const std::string& model_path, cond_stage_model->alloc_params_buffer(); cond_stage_model->get_param_tensors(tensors); - diffusion_model->alloc_params_buffer(); - diffusion_model->get_param_tensors(tensors); + if(!details.skip_unet) { + diffusion_model->alloc_params_buffer(); + diffusion_model->get_param_tensors(tensors); + } + assert(!details.skip_vae);// not implemented yet + assert(!details.skip_text_encoders);// not implemented yet if (!use_tiny_autoencoder) { if (vae_on_cpu && !ggml_backend_is_cpu(backend)) { diff --git a/stable-diffusion.hpp b/stable-diffusion.hpp index 177dae178..f4373a840 100644 --- a/stable-diffusion.hpp +++ b/stable-diffusion.hpp @@ -37,6 +37,13 @@ void calculate_alphas_cumprod(float* alphas_cumprod, } } +struct StableDiffusionLoadConfiguration +{ + bool skip_unet = false; + bool skip_vae = false; + bool skip_text_encoders = false; +}; + /*=============================================== StableDiffusionGGML ================================================*/ class StableDiffusionGGML { @@ -108,7 +115,8 @@ class StableDiffusionGGML { bool clip_on_cpu, bool control_net_cpu, bool vae_on_cpu, - bool diffusion_flash_attn); + bool diffusion_flash_attn, + StableDiffusionLoadConfiguration details = StableDiffusionLoadConfiguration()); bool is_using_v_parameterization_for_sd2(ggml_context* work_ctx, bool is_inpaint = false); From 8a63b34694d36cfb6999f5ca95b3593556d41be7 Mon Sep 17 00:00:00 2001 From: "Harris M. 
Snyder" Date: Sun, 8 Jun 2025 15:36:52 -0400 Subject: [PATCH 4/5] factor out vae inference functions --- stable-diffusion.cpp | 907 ++++++++++++++++++++++--------------------- stable-diffusion.hpp | 15 + 2 files changed, 476 insertions(+), 446 deletions(-) diff --git a/stable-diffusion.cpp b/stable-diffusion.cpp index 6fe712fff..1f124849a 100644 --- a/stable-diffusion.cpp +++ b/stable-diffusion.cpp @@ -6,6 +6,93 @@ #include "stb_image.h" +// ldm.models.diffusion.ddpm.LatentDiffusion.get_first_stage_encoding +ggml_tensor* vae_sample(ggml_context* work_ctx, ggml_tensor* moments, float scale_factor, std::shared_ptr rng) +{ + // ldm.modules.distributions.distributions.DiagonalGaussianDistribution.sample + ggml_tensor* latent = ggml_new_tensor_4d(work_ctx, moments->type, moments->ne[0], moments->ne[1], moments->ne[2] / 2, moments->ne[3]); + struct ggml_tensor* noise = ggml_dup_tensor(work_ctx, latent); + ggml_tensor_set_f32_randn(noise, rng); + // noise = load_tensor_from_file(work_ctx, "noise.bin"); + { + float mean = 0; + float logvar = 0; + float value = 0; + float std_ = 0; + for (int i = 0; i < latent->ne[3]; i++) { + for (int j = 0; j < latent->ne[2]; j++) { + for (int k = 0; k < latent->ne[1]; k++) { + for (int l = 0; l < latent->ne[0]; l++) { + mean = ggml_tensor_get_f32(moments, l, k, j, i); + logvar = ggml_tensor_get_f32(moments, l, k, j + (int)latent->ne[2], i); + logvar = std::max(-30.0f, std::min(logvar, 20.0f)); + std_ = std::exp(0.5f * logvar); + value = mean + std_ * ggml_tensor_get_f32(noise, l, k, j, i); + value = value * scale_factor; + // printf("%d %d %d %d -> %f\n", i, j, k, l, value); + ggml_tensor_set_f32(latent, value, l, k, j, i); + } + } + } + } + } + return latent; +} + +ggml_tensor* vae_run( + AutoEncoderKL *first_stage_model, + ggml_context* work_ctx, + ggml_tensor* x, + bool decode, + SDVersion version, + float scale_factor, + bool vae_tiling, + int n_threads) +{ + // I ripped out the tiny autoencoder code paths from here... sorry + int64_t W = x->ne[0]; + int64_t H = x->ne[1]; + int64_t C = 8; + + if (sd_version_is_sd3(version)) { + C = 32; + } else if (sd_version_is_flux(version)) { + C = 32; + } + + ggml_tensor* result = ggml_new_tensor_4d(work_ctx, GGML_TYPE_F32, + decode ? (W * 8) : (W / 8), // width + decode ? (H * 8) : (H / 8), // height + decode ? 3 : C, + x->ne[3]); // channels + int64_t t0 = ggml_time_ms(); + if (decode) { + ggml_tensor_scale(x, 1.0f / scale_factor); + } else { + ggml_tensor_scale_input(x); + } + if (vae_tiling && decode) { // TODO: support tiling vae encode + // split latent in 32x32 tiles and compute in several steps + auto on_tiling = [&](ggml_tensor* in, ggml_tensor* out, bool init) { + first_stage_model->compute(n_threads, in, decode, &out); + }; + sd_tiling(x, result, 8, 32, 0.5f, on_tiling); + } else { + first_stage_model->compute(n_threads, x, decode, &result); + } + first_stage_model->free_compute_buffer(); + if (decode) { + ggml_tensor_scale_output(result); + } + + int64_t t1 = ggml_time_ms(); + LOG_DEBUG("computing vae [mode: %s] graph completed, taking %.2fs", decode ? 
"DECODE" : "ENCODE", (t1 - t0) * 1.0f / 1000); + if (decode) { + ggml_tensor_clamp(result, 0.0f, 1.0f); + } + return result; +} + // #define STB_IMAGE_WRITE_IMPLEMENTATION // #define STB_IMAGE_WRITE_STATIC @@ -531,505 +618,433 @@ bool StableDiffusionGGML::load_from_file(const std::string& model_path, return true; } - bool StableDiffusionGGML::is_using_v_parameterization_for_sd2(ggml_context* work_ctx, bool is_inpaint) { - struct ggml_tensor* x_t = ggml_new_tensor_4d(work_ctx, GGML_TYPE_F32, 8, 8, 4, 1); - ggml_set_f32(x_t, 0.5); - struct ggml_tensor* c = ggml_new_tensor_4d(work_ctx, GGML_TYPE_F32, 1024, 2, 1, 1); - ggml_set_f32(c, 0.5); +bool StableDiffusionGGML::is_using_v_parameterization_for_sd2(ggml_context* work_ctx, bool is_inpaint) { + struct ggml_tensor* x_t = ggml_new_tensor_4d(work_ctx, GGML_TYPE_F32, 8, 8, 4, 1); + ggml_set_f32(x_t, 0.5); + struct ggml_tensor* c = ggml_new_tensor_4d(work_ctx, GGML_TYPE_F32, 1024, 2, 1, 1); + ggml_set_f32(c, 0.5); - struct ggml_tensor* timesteps = ggml_new_tensor_1d(work_ctx, GGML_TYPE_F32, 1); - ggml_set_f32(timesteps, 999); + struct ggml_tensor* timesteps = ggml_new_tensor_1d(work_ctx, GGML_TYPE_F32, 1); + ggml_set_f32(timesteps, 999); - struct ggml_tensor* concat = is_inpaint ? ggml_new_tensor_4d(work_ctx, GGML_TYPE_F32, 8, 8, 5, 1) : NULL; - if (concat != NULL) { - ggml_set_f32(concat, 0); - } + struct ggml_tensor* concat = is_inpaint ? ggml_new_tensor_4d(work_ctx, GGML_TYPE_F32, 8, 8, 5, 1) : NULL; + if (concat != NULL) { + ggml_set_f32(concat, 0); + } - int64_t t0 = ggml_time_ms(); - struct ggml_tensor* out = ggml_dup_tensor(work_ctx, x_t); - diffusion_model->compute(n_threads, x_t, timesteps, c, concat, NULL, NULL, -1, {}, 0.f, &out); - diffusion_model->free_compute_buffer(); + int64_t t0 = ggml_time_ms(); + struct ggml_tensor* out = ggml_dup_tensor(work_ctx, x_t); + diffusion_model->compute(n_threads, x_t, timesteps, c, concat, NULL, NULL, -1, {}, 0.f, &out); + diffusion_model->free_compute_buffer(); - double result = 0.f; - { - float* vec_x = (float*)x_t->data; - float* vec_out = (float*)out->data; + double result = 0.f; + { + float* vec_x = (float*)x_t->data; + float* vec_out = (float*)out->data; - int64_t n = ggml_nelements(out); + int64_t n = ggml_nelements(out); - for (int i = 0; i < n; i++) { - result += ((double)vec_out[i] - (double)vec_x[i]); - } - result /= n; - } - int64_t t1 = ggml_time_ms(); - LOG_DEBUG("check is_using_v_parameterization_for_sd2, taking %.2fs", (t1 - t0) * 1.0f / 1000); - return result < -1; - } - - void StableDiffusionGGML::apply_lora(const std::string& lora_name, float multiplier) { - int64_t t0 = ggml_time_ms(); - std::string st_file_path = path_join(lora_model_dir, lora_name + ".safetensors"); - std::string ckpt_file_path = path_join(lora_model_dir, lora_name + ".ckpt"); - std::string file_path; - if (file_exists(st_file_path)) { - file_path = st_file_path; - } else if (file_exists(ckpt_file_path)) { - file_path = ckpt_file_path; - } else { - LOG_WARN("can not find %s or %s for lora %s", st_file_path.c_str(), ckpt_file_path.c_str(), lora_name.c_str()); - return; - } - LoraModel lora(backend, file_path); - if (!lora.load_from_file()) { - LOG_WARN("load lora tensors from %s failed", file_path.c_str()); - return; + for (int i = 0; i < n; i++) { + result += ((double)vec_out[i] - (double)vec_x[i]); } + result /= n; + } + int64_t t1 = ggml_time_ms(); + LOG_DEBUG("check is_using_v_parameterization_for_sd2, taking %.2fs", (t1 - t0) * 1.0f / 1000); + return result < -1; +} - lora.multiplier = multiplier; - // TODO: 
send version? - lora.apply(tensors, version, n_threads); - lora.free_params_buffer(); +void StableDiffusionGGML::apply_lora(const std::string& lora_name, float multiplier) { + int64_t t0 = ggml_time_ms(); + std::string st_file_path = path_join(lora_model_dir, lora_name + ".safetensors"); + std::string ckpt_file_path = path_join(lora_model_dir, lora_name + ".ckpt"); + std::string file_path; + if (file_exists(st_file_path)) { + file_path = st_file_path; + } else if (file_exists(ckpt_file_path)) { + file_path = ckpt_file_path; + } else { + LOG_WARN("can not find %s or %s for lora %s", st_file_path.c_str(), ckpt_file_path.c_str(), lora_name.c_str()); + return; + } + LoraModel lora(backend, file_path); + if (!lora.load_from_file()) { + LOG_WARN("load lora tensors from %s failed", file_path.c_str()); + return; + } - int64_t t1 = ggml_time_ms(); + lora.multiplier = multiplier; + // TODO: send version? + lora.apply(tensors, version, n_threads); + lora.free_params_buffer(); + + int64_t t1 = ggml_time_ms(); - LOG_INFO("lora '%s' applied, taking %.2fs", lora_name.c_str(), (t1 - t0) * 1.0f / 1000); + LOG_INFO("lora '%s' applied, taking %.2fs", lora_name.c_str(), (t1 - t0) * 1.0f / 1000); +} + +void StableDiffusionGGML::apply_loras(const std::unordered_map& lora_state) { + if (lora_state.size() > 0 && model_wtype != GGML_TYPE_F16 && model_wtype != GGML_TYPE_F32) { + LOG_WARN("In quantized models when applying LoRA, the images have poor quality."); + } + std::unordered_map lora_state_diff; + for (auto& kv : lora_state) { + const std::string& lora_name = kv.first; + float multiplier = kv.second; + lora_state_diff[lora_name] += multiplier; + } + for (auto& kv : curr_lora_state) { + const std::string& lora_name = kv.first; + float curr_multiplier = kv.second; + lora_state_diff[lora_name] -= curr_multiplier; } - void StableDiffusionGGML::apply_loras(const std::unordered_map& lora_state) { - if (lora_state.size() > 0 && model_wtype != GGML_TYPE_F16 && model_wtype != GGML_TYPE_F32) { - LOG_WARN("In quantized models when applying LoRA, the images have poor quality."); - } - std::unordered_map lora_state_diff; - for (auto& kv : lora_state) { - const std::string& lora_name = kv.first; - float multiplier = kv.second; - lora_state_diff[lora_name] += multiplier; - } - for (auto& kv : curr_lora_state) { - const std::string& lora_name = kv.first; - float curr_multiplier = kv.second; - lora_state_diff[lora_name] -= curr_multiplier; - } + size_t rm = lora_state_diff.size() - lora_state.size(); + if (rm != 0) { + LOG_INFO("Attempting to apply %lu LoRAs (removing %lu applied LoRAs)", lora_state.size(), rm); + } else { + LOG_INFO("Attempting to apply %lu LoRAs", lora_state.size()); + } + + for (auto& kv : lora_state_diff) { + apply_lora(kv.first, kv.second); + } + + curr_lora_state = lora_state; +} + +ggml_tensor* StableDiffusionGGML::id_encoder(ggml_context* work_ctx, + ggml_tensor* init_img, + ggml_tensor* prompts_embeds, + ggml_tensor* id_embeds, + std::vector& class_tokens_mask) { + ggml_tensor* res = NULL; + pmid_model->compute(n_threads, init_img, prompts_embeds, id_embeds, class_tokens_mask, &res, work_ctx); + return res; +} - size_t rm = lora_state_diff.size() - lora_state.size(); - if (rm != 0) { - LOG_INFO("Attempting to apply %lu LoRAs (removing %lu applied LoRAs)", lora_state.size(), rm); +SDCondition StableDiffusionGGML::get_svd_condition(ggml_context* work_ctx, + sd_image_t init_image, + int width, + int height, + int fps , + int motion_bucket_id , + float augmentation_level , + bool force_zero_embeddings ) 
{ + // c_crossattn + int64_t t0 = ggml_time_ms(); + struct ggml_tensor* c_crossattn = NULL; + { + if (force_zero_embeddings) { + c_crossattn = ggml_new_tensor_1d(work_ctx, GGML_TYPE_F32, clip_vision->vision_model.projection_dim); + ggml_set_f32(c_crossattn, 0.f); } else { - LOG_INFO("Attempting to apply %lu LoRAs", lora_state.size()); + sd_image_f32_t image = sd_image_t_to_sd_image_f32_t(init_image); + sd_image_f32_t resized_image = clip_preprocess(image, clip_vision->vision_model.image_size); + free(image.data); + image.data = NULL; + + ggml_tensor* pixel_values = ggml_new_tensor_4d(work_ctx, GGML_TYPE_F32, resized_image.width, resized_image.height, 3, 1); + sd_image_f32_to_tensor(resized_image.data, pixel_values, false); + free(resized_image.data); + resized_image.data = NULL; + + // print_ggml_tensor(pixel_values); + clip_vision->compute(n_threads, pixel_values, &c_crossattn, work_ctx); + // print_ggml_tensor(c_crossattn); } + } - for (auto& kv : lora_state_diff) { - apply_lora(kv.first, kv.second); - } + // c_concat + struct ggml_tensor* c_concat = NULL; + { + if (force_zero_embeddings) { + c_concat = ggml_new_tensor_4d(work_ctx, GGML_TYPE_F32, width / 8, height / 8, 4, 1); + ggml_set_f32(c_concat, 0.f); + } else { + ggml_tensor* init_img = ggml_new_tensor_4d(work_ctx, GGML_TYPE_F32, width, height, 3, 1); - curr_lora_state = lora_state; - } - - ggml_tensor* StableDiffusionGGML::id_encoder(ggml_context* work_ctx, - ggml_tensor* init_img, - ggml_tensor* prompts_embeds, - ggml_tensor* id_embeds, - std::vector& class_tokens_mask) { - ggml_tensor* res = NULL; - pmid_model->compute(n_threads, init_img, prompts_embeds, id_embeds, class_tokens_mask, &res, work_ctx); - return res; - } - - SDCondition StableDiffusionGGML::get_svd_condition(ggml_context* work_ctx, - sd_image_t init_image, - int width, - int height, - int fps , - int motion_bucket_id , - float augmentation_level , - bool force_zero_embeddings ) { - // c_crossattn - int64_t t0 = ggml_time_ms(); - struct ggml_tensor* c_crossattn = NULL; - { - if (force_zero_embeddings) { - c_crossattn = ggml_new_tensor_1d(work_ctx, GGML_TYPE_F32, clip_vision->vision_model.projection_dim); - ggml_set_f32(c_crossattn, 0.f); - } else { + if (width != init_image.width || height != init_image.height) { sd_image_f32_t image = sd_image_t_to_sd_image_f32_t(init_image); - sd_image_f32_t resized_image = clip_preprocess(image, clip_vision->vision_model.image_size); + sd_image_f32_t resized_image = resize_sd_image_f32_t(image, width, height); free(image.data); image.data = NULL; - - ggml_tensor* pixel_values = ggml_new_tensor_4d(work_ctx, GGML_TYPE_F32, resized_image.width, resized_image.height, 3, 1); - sd_image_f32_to_tensor(resized_image.data, pixel_values, false); + sd_image_f32_to_tensor(resized_image.data, init_img, false); free(resized_image.data); resized_image.data = NULL; - - // print_ggml_tensor(pixel_values); - clip_vision->compute(n_threads, pixel_values, &c_crossattn, work_ctx); - // print_ggml_tensor(c_crossattn); - } - } - - // c_concat - struct ggml_tensor* c_concat = NULL; - { - if (force_zero_embeddings) { - c_concat = ggml_new_tensor_4d(work_ctx, GGML_TYPE_F32, width / 8, height / 8, 4, 1); - ggml_set_f32(c_concat, 0.f); } else { - ggml_tensor* init_img = ggml_new_tensor_4d(work_ctx, GGML_TYPE_F32, width, height, 3, 1); - - if (width != init_image.width || height != init_image.height) { - sd_image_f32_t image = sd_image_t_to_sd_image_f32_t(init_image); - sd_image_f32_t resized_image = resize_sd_image_f32_t(image, width, height); - 
free(image.data); - image.data = NULL; - sd_image_f32_to_tensor(resized_image.data, init_img, false); - free(resized_image.data); - resized_image.data = NULL; - } else { - sd_image_to_tensor(init_image.data, init_img); - } - if (augmentation_level > 0.f) { - struct ggml_tensor* noise = ggml_dup_tensor(work_ctx, init_img); - ggml_tensor_set_f32_randn(noise, rng); - // encode_pixels += torch.randn_like(pixels) * augmentation_level - ggml_tensor_scale(noise, augmentation_level); - ggml_tensor_add(init_img, noise); - } - ggml_tensor* moments = encode_first_stage(work_ctx, init_img); - c_concat = get_first_stage_encoding(work_ctx, moments); + sd_image_to_tensor(init_image.data, init_img); } + if (augmentation_level > 0.f) { + struct ggml_tensor* noise = ggml_dup_tensor(work_ctx, init_img); + ggml_tensor_set_f32_randn(noise, rng); + // encode_pixels += torch.randn_like(pixels) * augmentation_level + ggml_tensor_scale(noise, augmentation_level); + ggml_tensor_add(init_img, noise); + } + ggml_tensor* moments = encode_first_stage(work_ctx, init_img); + c_concat = get_first_stage_encoding(work_ctx, moments); } + } - // y - struct ggml_tensor* y = NULL; - { - y = ggml_new_tensor_1d(work_ctx, GGML_TYPE_F32, diffusion_model->get_adm_in_channels()); - int out_dim = 256; - int fps_id = fps - 1; - std::vector timesteps = {(float)fps_id, (float)motion_bucket_id, augmentation_level}; - set_timestep_embedding(timesteps, y, out_dim); - } - int64_t t1 = ggml_time_ms(); - LOG_DEBUG("computing svd condition graph completed, taking %" PRId64 " ms", t1 - t0); - return {c_crossattn, y, c_concat}; - } - - ggml_tensor* StableDiffusionGGML::sample(ggml_context* work_ctx, - ggml_tensor* init_latent, - ggml_tensor* noise, - SDCondition cond, - SDCondition uncond, - ggml_tensor* control_hint, - float control_strength, - float min_cfg, - float cfg_scale, - float guidance, - float eta, - sample_method_t method, - const std::vector& sigmas, - int start_merge_step, - SDCondition id_cond, - std::vector skip_layers , - float slg_scale , - float skip_layer_start , - float skip_layer_end , - ggml_tensor* noise_mask ) { - LOG_DEBUG("Sample"); - struct ggml_init_params params; - size_t data_size = ggml_row_size(init_latent->type, init_latent->ne[0]); - for (int i = 1; i < 4; i++) { - data_size *= init_latent->ne[i]; - } - data_size += 1024; - params.mem_size = data_size * 3; - params.mem_buffer = NULL; - params.no_alloc = false; - ggml_context* tmp_ctx = ggml_init(params); - - size_t steps = sigmas.size() - 1; - // noise = load_tensor_from_file(work_ctx, "./rand0.bin"); - // print_ggml_tensor(noise); - struct ggml_tensor* x = ggml_dup_tensor(work_ctx, init_latent); - copy_ggml_tensor(x, init_latent); - x = denoiser->noise_scaling(sigmas[0], noise, x); - - struct ggml_tensor* noised_input = ggml_dup_tensor(work_ctx, noise); - - bool has_unconditioned = cfg_scale != 1.0 && uncond.c_crossattn != NULL; - bool has_skiplayer = slg_scale != 0.0 && skip_layers.size() > 0; + // y + struct ggml_tensor* y = NULL; + { + y = ggml_new_tensor_1d(work_ctx, GGML_TYPE_F32, diffusion_model->get_adm_in_channels()); + int out_dim = 256; + int fps_id = fps - 1; + std::vector timesteps = {(float)fps_id, (float)motion_bucket_id, augmentation_level}; + set_timestep_embedding(timesteps, y, out_dim); + } + int64_t t1 = ggml_time_ms(); + LOG_DEBUG("computing svd condition graph completed, taking %" PRId64 " ms", t1 - t0); + return {c_crossattn, y, c_concat}; +} - // denoise wrapper - struct ggml_tensor* out_cond = ggml_dup_tensor(work_ctx, x); - struct 
ggml_tensor* out_uncond = NULL; - struct ggml_tensor* out_skip = NULL; +ggml_tensor* StableDiffusionGGML::sample(ggml_context* work_ctx, + ggml_tensor* init_latent, + ggml_tensor* noise, + SDCondition cond, + SDCondition uncond, + ggml_tensor* control_hint, + float control_strength, + float min_cfg, + float cfg_scale, + float guidance, + float eta, + sample_method_t method, + const std::vector& sigmas, + int start_merge_step, + SDCondition id_cond, + std::vector skip_layers , + float slg_scale , + float skip_layer_start , + float skip_layer_end , + ggml_tensor* noise_mask ) { + LOG_DEBUG("Sample"); + struct ggml_init_params params; + size_t data_size = ggml_row_size(init_latent->type, init_latent->ne[0]); + for (int i = 1; i < 4; i++) { + data_size *= init_latent->ne[i]; + } + data_size += 1024; + params.mem_size = data_size * 3; + params.mem_buffer = NULL; + params.no_alloc = false; + ggml_context* tmp_ctx = ggml_init(params); + + size_t steps = sigmas.size() - 1; + // noise = load_tensor_from_file(work_ctx, "./rand0.bin"); + // print_ggml_tensor(noise); + struct ggml_tensor* x = ggml_dup_tensor(work_ctx, init_latent); + copy_ggml_tensor(x, init_latent); + x = denoiser->noise_scaling(sigmas[0], noise, x); + + struct ggml_tensor* noised_input = ggml_dup_tensor(work_ctx, noise); + + bool has_unconditioned = cfg_scale != 1.0 && uncond.c_crossattn != NULL; + bool has_skiplayer = slg_scale != 0.0 && skip_layers.size() > 0; + + // denoise wrapper + struct ggml_tensor* out_cond = ggml_dup_tensor(work_ctx, x); + struct ggml_tensor* out_uncond = NULL; + struct ggml_tensor* out_skip = NULL; + + if (has_unconditioned) { + out_uncond = ggml_dup_tensor(work_ctx, x); + } + if (has_skiplayer) { + if (sd_version_is_dit(version)) { + out_skip = ggml_dup_tensor(work_ctx, x); + } else { + has_skiplayer = false; + LOG_WARN("SLG is incompatible with %s models", model_version_to_str[version]); + } + } + struct ggml_tensor* denoised = ggml_dup_tensor(work_ctx, x); - if (has_unconditioned) { - out_uncond = ggml_dup_tensor(work_ctx, x); + auto denoise = [&](ggml_tensor* input, float sigma, int step) -> ggml_tensor* { + if (step == 1) { + pretty_progress(0, (int)steps, 0); } - if (has_skiplayer) { - if (sd_version_is_dit(version)) { - out_skip = ggml_dup_tensor(work_ctx, x); - } else { - has_skiplayer = false; - LOG_WARN("SLG is incompatible with %s models", model_version_to_str[version]); - } + int64_t t0 = ggml_time_us(); + + std::vector scaling = denoiser->get_scalings(sigma); + GGML_ASSERT(scaling.size() == 3); + float c_skip = scaling[0]; + float c_out = scaling[1]; + float c_in = scaling[2]; + + float t = denoiser->sigma_to_t(sigma); + std::vector timesteps_vec(x->ne[3], t); // [N, ] + auto timesteps = vector_to_ggml_tensor(work_ctx, timesteps_vec); + std::vector guidance_vec(x->ne[3], guidance); + auto guidance_tensor = vector_to_ggml_tensor(work_ctx, guidance_vec); + + copy_ggml_tensor(noised_input, input); + // noised_input = noised_input * c_in + ggml_tensor_scale(noised_input, c_in); + + std::vector controls; + + if (control_hint != NULL) { + control_net->compute(n_threads, noised_input, control_hint, timesteps, cond.c_crossattn, cond.c_vector); + controls = control_net->controls; + // print_ggml_tensor(controls[12]); + // GGML_ASSERT(0); } - struct ggml_tensor* denoised = ggml_dup_tensor(work_ctx, x); - - auto denoise = [&](ggml_tensor* input, float sigma, int step) -> ggml_tensor* { - if (step == 1) { - pretty_progress(0, (int)steps, 0); - } - int64_t t0 = ggml_time_us(); - - std::vector scaling = 
denoiser->get_scalings(sigma); - GGML_ASSERT(scaling.size() == 3); - float c_skip = scaling[0]; - float c_out = scaling[1]; - float c_in = scaling[2]; - float t = denoiser->sigma_to_t(sigma); - std::vector timesteps_vec(x->ne[3], t); // [N, ] - auto timesteps = vector_to_ggml_tensor(work_ctx, timesteps_vec); - std::vector guidance_vec(x->ne[3], guidance); - auto guidance_tensor = vector_to_ggml_tensor(work_ctx, guidance_vec); - - copy_ggml_tensor(noised_input, input); - // noised_input = noised_input * c_in - ggml_tensor_scale(noised_input, c_in); - - std::vector controls; + if (start_merge_step == -1 || step <= start_merge_step) { + // cond + diffusion_model->compute(n_threads, + noised_input, + timesteps, + cond.c_crossattn, + cond.c_concat, + cond.c_vector, + guidance_tensor, + -1, + controls, + control_strength, + &out_cond); + } else { + diffusion_model->compute(n_threads, + noised_input, + timesteps, + id_cond.c_crossattn, + cond.c_concat, + id_cond.c_vector, + guidance_tensor, + -1, + controls, + control_strength, + &out_cond); + } + float* negative_data = NULL; + if (has_unconditioned) { + // uncond if (control_hint != NULL) { - control_net->compute(n_threads, noised_input, control_hint, timesteps, cond.c_crossattn, cond.c_vector); + control_net->compute(n_threads, noised_input, control_hint, timesteps, uncond.c_crossattn, uncond.c_vector); controls = control_net->controls; - // print_ggml_tensor(controls[12]); - // GGML_ASSERT(0); - } - - if (start_merge_step == -1 || step <= start_merge_step) { - // cond - diffusion_model->compute(n_threads, - noised_input, - timesteps, - cond.c_crossattn, - cond.c_concat, - cond.c_vector, - guidance_tensor, - -1, - controls, - control_strength, - &out_cond); - } else { - diffusion_model->compute(n_threads, - noised_input, - timesteps, - id_cond.c_crossattn, - cond.c_concat, - id_cond.c_vector, - guidance_tensor, - -1, - controls, - control_strength, - &out_cond); } + diffusion_model->compute(n_threads, + noised_input, + timesteps, + uncond.c_crossattn, + uncond.c_concat, + uncond.c_vector, + guidance_tensor, + -1, + controls, + control_strength, + &out_uncond); + negative_data = (float*)out_uncond->data; + } - float* negative_data = NULL; + int step_count = sigmas.size(); + bool is_skiplayer_step = has_skiplayer && step > (int)(skip_layer_start * step_count) && step < (int)(skip_layer_end * step_count); + float* skip_layer_data = NULL; + if (is_skiplayer_step) { + LOG_DEBUG("Skipping layers at step %d\n", step); + // skip layer (same as conditionned) + diffusion_model->compute(n_threads, + noised_input, + timesteps, + cond.c_crossattn, + cond.c_concat, + cond.c_vector, + guidance_tensor, + -1, + controls, + control_strength, + &out_skip, + NULL, + skip_layers); + skip_layer_data = (float*)out_skip->data; + } + float* vec_denoised = (float*)denoised->data; + float* vec_input = (float*)input->data; + float* positive_data = (float*)out_cond->data; + int ne_elements = (int)ggml_nelements(denoised); + for (int i = 0; i < ne_elements; i++) { + float latent_result = positive_data[i]; if (has_unconditioned) { - // uncond - if (control_hint != NULL) { - control_net->compute(n_threads, noised_input, control_hint, timesteps, uncond.c_crossattn, uncond.c_vector); - controls = control_net->controls; + // out_uncond + cfg_scale * (out_cond - out_uncond) + int64_t ne3 = out_cond->ne[3]; + if (min_cfg != cfg_scale && ne3 != 1) { + int64_t i3 = i / out_cond->ne[0] * out_cond->ne[1] * out_cond->ne[2]; + float scale = min_cfg + (cfg_scale - min_cfg) * (i3 * 1.0f / 
ne3); + } else { + latent_result = negative_data[i] + cfg_scale * (positive_data[i] - negative_data[i]); } - diffusion_model->compute(n_threads, - noised_input, - timesteps, - uncond.c_crossattn, - uncond.c_concat, - uncond.c_vector, - guidance_tensor, - -1, - controls, - control_strength, - &out_uncond); - negative_data = (float*)out_uncond->data; } - - int step_count = sigmas.size(); - bool is_skiplayer_step = has_skiplayer && step > (int)(skip_layer_start * step_count) && step < (int)(skip_layer_end * step_count); - float* skip_layer_data = NULL; if (is_skiplayer_step) { - LOG_DEBUG("Skipping layers at step %d\n", step); - // skip layer (same as conditionned) - diffusion_model->compute(n_threads, - noised_input, - timesteps, - cond.c_crossattn, - cond.c_concat, - cond.c_vector, - guidance_tensor, - -1, - controls, - control_strength, - &out_skip, - NULL, - skip_layers); - skip_layer_data = (float*)out_skip->data; - } - float* vec_denoised = (float*)denoised->data; - float* vec_input = (float*)input->data; - float* positive_data = (float*)out_cond->data; - int ne_elements = (int)ggml_nelements(denoised); - for (int i = 0; i < ne_elements; i++) { - float latent_result = positive_data[i]; - if (has_unconditioned) { - // out_uncond + cfg_scale * (out_cond - out_uncond) - int64_t ne3 = out_cond->ne[3]; - if (min_cfg != cfg_scale && ne3 != 1) { - int64_t i3 = i / out_cond->ne[0] * out_cond->ne[1] * out_cond->ne[2]; - float scale = min_cfg + (cfg_scale - min_cfg) * (i3 * 1.0f / ne3); - } else { - latent_result = negative_data[i] + cfg_scale * (positive_data[i] - negative_data[i]); - } - } - if (is_skiplayer_step) { - latent_result = latent_result + (positive_data[i] - skip_layer_data[i]) * slg_scale; - } - // v = latent_result, eps = latent_result - // denoised = (v * c_out + input * c_skip) or (input + eps * c_out) - vec_denoised[i] = latent_result * c_out + vec_input[i] * c_skip; - } - int64_t t1 = ggml_time_us(); - if (step > 0) { - pretty_progress(step, (int)steps, (t1 - t0) / 1000000.f); - // LOG_INFO("step %d sampling completed taking %.2fs", step, (t1 - t0) * 1.0f / 1000000); + latent_result = latent_result + (positive_data[i] - skip_layer_data[i]) * slg_scale; } - if (noise_mask != nullptr) { - for (int64_t x = 0; x < denoised->ne[0]; x++) { - for (int64_t y = 0; y < denoised->ne[1]; y++) { - float mask = ggml_tensor_get_f32(noise_mask, x, y); - for (int64_t k = 0; k < denoised->ne[2]; k++) { - float init = ggml_tensor_get_f32(init_latent, x, y, k); - float den = ggml_tensor_get_f32(denoised, x, y, k); - ggml_tensor_set_f32(denoised, init + mask * (den - init), x, y, k); - } + // v = latent_result, eps = latent_result + // denoised = (v * c_out + input * c_skip) or (input + eps * c_out) + vec_denoised[i] = latent_result * c_out + vec_input[i] * c_skip; + } + int64_t t1 = ggml_time_us(); + if (step > 0) { + pretty_progress(step, (int)steps, (t1 - t0) / 1000000.f); + // LOG_INFO("step %d sampling completed taking %.2fs", step, (t1 - t0) * 1.0f / 1000000); + } + if (noise_mask != nullptr) { + for (int64_t x = 0; x < denoised->ne[0]; x++) { + for (int64_t y = 0; y < denoised->ne[1]; y++) { + float mask = ggml_tensor_get_f32(noise_mask, x, y); + for (int64_t k = 0; k < denoised->ne[2]; k++) { + float init = ggml_tensor_get_f32(init_latent, x, y, k); + float den = ggml_tensor_get_f32(denoised, x, y, k); + ggml_tensor_set_f32(denoised, init + mask * (den - init), x, y, k); } } } + } - return denoised; - }; + return denoised; + }; - sample_k_diffusion(method, denoise, work_ctx, x, sigmas, rng, 
eta); + sample_k_diffusion(method, denoise, work_ctx, x, sigmas, rng, eta); - x = denoiser->inverse_noise_scaling(sigmas[sigmas.size() - 1], x); + x = denoiser->inverse_noise_scaling(sigmas[sigmas.size() - 1], x); - if (control_net) { - control_net->free_control_ctx(); - control_net->free_compute_buffer(); - } - diffusion_model->free_compute_buffer(); - return x; - } - - // ldm.models.diffusion.ddpm.LatentDiffusion.get_first_stage_encoding - ggml_tensor* StableDiffusionGGML::get_first_stage_encoding(ggml_context* work_ctx, ggml_tensor* moments) { - // ldm.modules.distributions.distributions.DiagonalGaussianDistribution.sample - ggml_tensor* latent = ggml_new_tensor_4d(work_ctx, moments->type, moments->ne[0], moments->ne[1], moments->ne[2] / 2, moments->ne[3]); - struct ggml_tensor* noise = ggml_dup_tensor(work_ctx, latent); - ggml_tensor_set_f32_randn(noise, rng); - // noise = load_tensor_from_file(work_ctx, "noise.bin"); - { - float mean = 0; - float logvar = 0; - float value = 0; - float std_ = 0; - for (int i = 0; i < latent->ne[3]; i++) { - for (int j = 0; j < latent->ne[2]; j++) { - for (int k = 0; k < latent->ne[1]; k++) { - for (int l = 0; l < latent->ne[0]; l++) { - mean = ggml_tensor_get_f32(moments, l, k, j, i); - logvar = ggml_tensor_get_f32(moments, l, k, j + (int)latent->ne[2], i); - logvar = std::max(-30.0f, std::min(logvar, 20.0f)); - std_ = std::exp(0.5f * logvar); - value = mean + std_ * ggml_tensor_get_f32(noise, l, k, j, i); - value = value * scale_factor; - // printf("%d %d %d %d -> %f\n", i, j, k, l, value); - ggml_tensor_set_f32(latent, value, l, k, j, i); - } - } - } - } - } - return latent; + if (control_net) { + control_net->free_control_ctx(); + control_net->free_compute_buffer(); } + diffusion_model->free_compute_buffer(); + return x; +} - ggml_tensor* StableDiffusionGGML::compute_first_stage(ggml_context* work_ctx, ggml_tensor* x, bool decode) { - int64_t W = x->ne[0]; - int64_t H = x->ne[1]; - int64_t C = 8; - if (use_tiny_autoencoder) { - C = 4; - } else { - if (sd_version_is_sd3(version)) { - C = 32; - } else if (sd_version_is_flux(version)) { - C = 32; - } - } - ggml_tensor* result = ggml_new_tensor_4d(work_ctx, GGML_TYPE_F32, - decode ? (W * 8) : (W / 8), // width - decode ? (H * 8) : (H / 8), // height - decode ? 
3 : C, - x->ne[3]); // channels - int64_t t0 = ggml_time_ms(); - if (!use_tiny_autoencoder) { - if (decode) { - ggml_tensor_scale(x, 1.0f / scale_factor); - } else { - ggml_tensor_scale_input(x); - } - if (vae_tiling && decode) { // TODO: support tiling vae encode - // split latent in 32x32 tiles and compute in several steps - auto on_tiling = [&](ggml_tensor* in, ggml_tensor* out, bool init) { - first_stage_model->compute(n_threads, in, decode, &out); - }; - sd_tiling(x, result, 8, 32, 0.5f, on_tiling); - } else { - first_stage_model->compute(n_threads, x, decode, &result); - } - first_stage_model->free_compute_buffer(); - if (decode) { - ggml_tensor_scale_output(result); - } - } else { - if (vae_tiling && decode) { // TODO: support tiling vae encode - // split latent in 64x64 tiles and compute in several steps - auto on_tiling = [&](ggml_tensor* in, ggml_tensor* out, bool init) { - tae_first_stage->compute(n_threads, in, decode, &out); - }; - sd_tiling(x, result, 8, 64, 0.5f, on_tiling); - } else { - tae_first_stage->compute(n_threads, x, decode, &result); - } - tae_first_stage->free_compute_buffer(); - } +// ldm.models.diffusion.ddpm.LatentDiffusion.get_first_stage_encoding +ggml_tensor* StableDiffusionGGML::get_first_stage_encoding(ggml_context* work_ctx, ggml_tensor* moments) { + return vae_sample(work_ctx, moments, this->scale_factor, this->rng); +} - int64_t t1 = ggml_time_ms(); - LOG_DEBUG("computing vae [mode: %s] graph completed, taking %.2fs", decode ? "DECODE" : "ENCODE", (t1 - t0) * 1.0f / 1000); - if (decode) { - ggml_tensor_clamp(result, 0.0f, 1.0f); - } - return result; - } +ggml_tensor* StableDiffusionGGML::compute_first_stage(ggml_context* work_ctx, ggml_tensor* x, bool decode) +{ +assert(!use_tiny_autoencoder); +return vae_run( + this->first_stage_model.get(), + work_ctx, + x, + decode, + this->version, + this->scale_factor, + this->vae_tiling, + this->n_threads); +} - ggml_tensor* StableDiffusionGGML::encode_first_stage(ggml_context* work_ctx, ggml_tensor* x) { - return compute_first_stage(work_ctx, x, false); - } +ggml_tensor* StableDiffusionGGML::encode_first_stage(ggml_context* work_ctx, ggml_tensor* x) { + return compute_first_stage(work_ctx, x, false); +} - ggml_tensor* StableDiffusionGGML::decode_first_stage(ggml_context* work_ctx, ggml_tensor* x) { - return compute_first_stage(work_ctx, x, true); - } +ggml_tensor* StableDiffusionGGML::decode_first_stage(ggml_context* work_ctx, ggml_tensor* x) { + return compute_first_stage(work_ctx, x, true); +} diff --git a/stable-diffusion.hpp b/stable-diffusion.hpp index f4373a840..e79769b46 100644 --- a/stable-diffusion.hpp +++ b/stable-diffusion.hpp @@ -22,6 +22,21 @@ /*================================================== Helper Functions ================================================*/ +ggml_tensor* vae_sample(ggml_context* work_ctx, + ggml_tensor* moments, + float scale_factor, + std::shared_ptr rng); + +ggml_tensor* vae_run( + AutoEncoderKL *first_stage_model, + ggml_context* work_ctx, + ggml_tensor* x, + bool decode, + SDVersion version, + float scale_factor, + bool vae_tiling, + int n_threads); + void calculate_alphas_cumprod(float* alphas_cumprod, float linear_start = 0.00085f, float linear_end = 0.0120, From f53bd06b4edaba363f3aa9f746a82c826d283848 Mon Sep 17 00:00:00 2001 From: "Harris M. 
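
Because vae_run and vae_sample are now free functions, VAE inference can be driven without a full StableDiffusionGGML instance, given an already-loaded AutoEncoderKL. A rough sketch under those assumptions (model loading and work-context allocation are elided; VERSION_SD1 and the 0.18215f scale factor correspond to SD 1.x):

    // Hypothetical standalone decode of a latent to pixels.
    // `vae` is a loaded std::shared_ptr<AutoEncoderKL>; `latent` is a
    // [W/8, H/8, 4, 1] float32 tensor allocated in `work_ctx`.
    ggml_tensor* image = vae_run(vae.get(),
                                 work_ctx,
                                 latent,
                                 /*decode*/ true,
                                 VERSION_SD1,
                                 /*scale_factor*/ 0.18215f,
                                 /*vae_tiling*/ false,
                                 /*n_threads*/ 8);
    // `image` comes back as [W, H, 3, 1], clamped to [0, 1].
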
Snyder" Date: Sun, 8 Jun 2025 15:59:53 -0400 Subject: [PATCH 5/5] cleanup cpp/hpp split --- stable-diffusion.cpp | 15 +++++++++++++++ stable-diffusion.hpp | 12 +----------- 2 files changed, 16 insertions(+), 11 deletions(-) diff --git a/stable-diffusion.cpp b/stable-diffusion.cpp index 1f124849a..c86d84100 100644 --- a/stable-diffusion.cpp +++ b/stable-diffusion.cpp @@ -6,6 +6,21 @@ #include "stb_image.h" +void calculate_alphas_cumprod(float* alphas_cumprod, + float linear_start, + float linear_end, + int timesteps) { + float ls_sqrt = sqrtf(linear_start); + float le_sqrt = sqrtf(linear_end); + float amount = le_sqrt - ls_sqrt; + float product = 1.0f; + for (int i = 0; i < timesteps; i++) { + float beta = ls_sqrt + amount * ((float)i / (timesteps - 1)); + product *= 1.0f - powf(beta, 2.0f); + alphas_cumprod[i] = product; + } +} + // ldm.models.diffusion.ddpm.LatentDiffusion.get_first_stage_encoding ggml_tensor* vae_sample(ggml_context* work_ctx, ggml_tensor* moments, float scale_factor, std::shared_ptr rng) { diff --git a/stable-diffusion.hpp b/stable-diffusion.hpp index e79769b46..773447ca2 100644 --- a/stable-diffusion.hpp +++ b/stable-diffusion.hpp @@ -40,17 +40,7 @@ ggml_tensor* vae_run( void calculate_alphas_cumprod(float* alphas_cumprod, float linear_start = 0.00085f, float linear_end = 0.0120, - int timesteps = TIMESTEPS) { - float ls_sqrt = sqrtf(linear_start); - float le_sqrt = sqrtf(linear_end); - float amount = le_sqrt - ls_sqrt; - float product = 1.0f; - for (int i = 0; i < timesteps; i++) { - float beta = ls_sqrt + amount * ((float)i / (timesteps - 1)); - product *= 1.0f - powf(beta, 2.0f); - alphas_cumprod[i] = product; - } -} + int timesteps = TIMESTEPS); struct StableDiffusionLoadConfiguration {