From e8ac336d91b855675e06b15a600a945e0e48da48 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?St=C3=A9phane=20du=20Hamel?= <stephduh@live.fr>
Date: Thu, 10 Jul 2025 19:13:43 +0200
Subject: [PATCH 01/45] fast latent image preview

---
 examples/cli/main.cpp | 124 +++++++++++++++++++++++++++++++++++++++++-
 stable-diffusion.cpp  |  19 +++++--
 stable-diffusion.h    |   4 +-
 3 files changed, 139 insertions(+), 8 deletions(-)
diff --git a/examples/cli/main.cpp b/examples/cli/main.cpp
index ec04dfde3..ef7877aa1 100644
--- a/examples/cli/main.cpp
+++ b/examples/cli/main.cpp
@@ -9,6 +9,9 @@
 #include <string>
 #include <vector>
 
+#include "model.h"
+
+
 // #include "preprocessing.hpp"
 #include "stable-diffusion.h"
 
@@ -752,6 +755,125 @@ void sd_log_cb(enum sd_log_level_t level, const char* log, void* data) {
     fflush(out_stream);
 }
 
+// https://github.com/comfyanonymous/ComfyUI/blob/master/comfy/latent_formats.py#L152-L169
+const float flux_latent_rgb_proj[16][3] = {
+    {-0.0346, 0.0244, 0.0681},
+    {0.0034, 0.0210, 0.0687},
+    {0.0275, -0.0668, -0.0433},
+    {-0.0174, 0.0160, 0.0617},
+    {0.0859, 0.0721, 0.0329},
+    {0.0004, 0.0383, 0.0115},
+    {0.0405, 0.0861, 0.0915},
+    {-0.0236, -0.0185, -0.0259},
+    {-0.0245, 0.0250, 0.1180},
+    {0.1008, 0.0755, -0.0421},
+    {-0.0515, 0.0201, 0.0011},
+    {0.0428, -0.0012, -0.0036},
+    {0.0817, 0.0765, 0.0749},
+    {-0.1264, -0.0522, -0.1103},
+    {-0.0280, -0.0881, -0.0499},
+    {-0.1262, -0.0982, -0.0778}};
+
+// https://github.com/Stability-AI/sd3.5/blob/main/sd3_impls.py#L228-L246
+const float sd3_latent_rgb_proj[16][3] = {
+    {-0.0645, 0.0177, 0.1052},
+    {0.0028, 0.0312, 0.0650},
+    {0.1848, 0.0762, 0.0360},
+    {0.0944, 0.0360, 0.0889},
+    {0.0897, 0.0506, -0.0364},
+    {-0.0020, 0.1203, 0.0284},
+    {0.0855, 0.0118, 0.0283},
+    {-0.0539, 0.0658, 0.1047},
+    {-0.0057, 0.0116, 0.0700},
+    {-0.0412, 0.0281, -0.0039},
+    {0.1106, 0.1171, 0.1220},
+    {-0.0248, 0.0682, -0.0481},
+    {0.0815, 0.0846, 0.1207},
+    {-0.0120, -0.0055, -0.0867},
+    {-0.0749, -0.0634, -0.0456},
+    {-0.1418, -0.1457, -0.1259},
+};
+
+// https://github.com/comfyanonymous/ComfyUI/blob/master/comfy/latent_formats.py#L32-L38
+const float sdxl_latent_rgb_proj[4][3] = {
+    {0.3651, 0.4232, 0.4341},
+    {-0.2533, -0.0042, 0.1068},
+    {0.1076, 0.1111, -0.0362},
+    {-0.3165, -0.2492, -0.2188}};
+
+// https://github.com/comfyanonymous/ComfyUI/blob/master/comfy/latent_formats.py#L32-L38
+const float sd_latent_rgb_proj[4][3]{
+    {0.3512, 0.2297, 0.3227},
+    {0.3250, 0.4974, 0.2350},
+    {-0.2829, 0.1762, 0.2721},
+    {-0.2120, -0.2616, -0.7177}};
+
+void step_callback(int step, struct ggml_tensor* latents, enum SDVersion version) {
+    const int channel = 3;
+    int width         = latents->ne[0];
+    int height        = latents->ne[1];
+    int dim           = latents->ne[2];
+
+    const float (*latent_rgb_proj)[channel];
+
+    if (dim == 16) {
+        // 16 channels VAE -> Flux or SD3
+
+        if (sd_version_is_sd3(version)) {
+            latent_rgb_proj = sd3_latent_rgb_proj;
+        } else if (sd_version_is_flux(version)) {
+            latent_rgb_proj = flux_latent_rgb_proj;
+        } else {
+            // unknown model
+            return;
+        }
+
+    } else if (dim == 4) {
+        // 4 channels VAE
+        if (version == VERSION_SDXL) {
+            latent_rgb_proj = sdxl_latent_rgb_proj;
+        } else if (version == VERSION_SD1 || version == VERSION_SD2) {
+            latent_rgb_proj = sd_latent_rgb_proj;
+        } else {
+            // unknown model
+            return;
+        }
+    } else {
+        // unknown latent space
+        return;
+    }
+    uint8_t* data = (uint8_t*)malloc(width * height * channel * sizeof(uint8_t));
+    int data_head = 0;
+    for (int j = 0; j < height; j++) {
+        for (int i = 0; i < width; i++) {
+            int latent_id = (i * latents->nb[0] + j * latents->nb[1]);
+            float r = 0, g = 0, b = 0;
+            for (int d = 0; d < dim; d++) {
+                float value = *(float*)((char*)latents->data + latent_id + d * latents->nb[2]);
+                r += value * latent_rgb_proj[d][0];
+                g += value * latent_rgb_proj[d][1];
+                b += value * latent_rgb_proj[d][2];
+            }
+
+            // change range
+            r = r * .5 + .5;
+            g = g * .5 + .5;
+            b = b * .5 + .5;
+
+            // clamp rgb values to [0,1] range
+            r = r >= 0 ? r <= 1 ? r : 1 : 0;
+            g = g >= 0 ? g <= 1 ? g : 1 : 0;
+            b = b >= 0 ? b <= 1 ? b : 1 : 0;
+
+            data[data_head++] = (uint8_t)(r * 255.);
+            data[data_head++] = (uint8_t)(g * 255.);
+            data[data_head++] = (uint8_t)(b * 255.);
+        }
+    }
+    stbi_write_png("latent-preview.png", width, height, channel, data, 0);
+    free(data);
+}
+
 int main(int argc, const char* argv[]) {
     SDParams params;
 
@@ -993,7 +1115,7 @@ int main(int argc, const char* argv[]) {
             params.input_id_images_path.c_str(),
         };
 
-        results              = generate_image(sd_ctx, &img_gen_params);
+        results              = generate_image(sd_ctx, &img_gen_params, &step_callback);
         expected_num_results = params.batch_count;
     } else if (params.mode == VID_GEN) {
         sd_vid_gen_params_t vid_gen_params = {
diff --git a/stable-diffusion.cpp b/stable-diffusion.cpp
index c5448f927..432b194d2 100644
--- a/stable-diffusion.cpp
+++ b/stable-diffusion.cpp
@@ -864,7 +864,8 @@ class StableDiffusionGGML {
                         int start_merge_step,
                         SDCondition id_cond,
                         std::vector<ggml_tensor*> ref_latents = {},
-                        ggml_tensor* denoise_mask             = nullptr) {
+                        ggml_tensor* denoise_mask             = nullptr,
+                           std::function<void(int, ggml_tensor*, SDVersion)> step_callback = nullptr) {
         std::vector<int> skip_layers(guidance.slg.layers, guidance.slg.layers + guidance.slg.layer_count);
 
         float cfg_scale     = guidance.txt_cfg;
@@ -1096,6 +1097,9 @@ class StableDiffusionGGML {
                 }
             }
 
+            if (step_callback != nullptr) {
+                step_callback(step, denoised, version);
+            }
             return denoised;
         };
 
@@ -1544,8 +1548,9 @@ sd_image_t* generate_image_internal(sd_ctx_t* sd_ctx,
                                     bool normalize_input,
                                     std::string input_id_images_path,
                                     std::vector<ggml_tensor*> ref_latents,
-                                    ggml_tensor* concat_latent = NULL,
-                                    ggml_tensor* denoise_mask  = NULL) {
+                                    ggml_tensor* concat_latent                                      = NULL,
+                                    ggml_tensor* denoise_mask                                       = NULL,
+                                    std::function<void(int, ggml_tensor*, SDVersion)> step_callback = nullptr) {
     if (seed < 0) {
         // Generally, when using the provided command line, the seed is always >0.
         // However, to prevent potential issues if 'stable-diffusion.cpp' is invoked as a library
@@ -1814,7 +1819,8 @@ sd_image_t* generate_image_internal(sd_ctx_t* sd_ctx,
                                                      start_merge_step,
                                                      id_cond,
                                                      ref_latents,
-                                                     denoise_mask);
+                                                     denoise_mask,
+                                                     step_callback);
 
         // struct ggml_tensor* x_0 = load_tensor_from_file(ctx, "samples_ddim.bin");
         // print_ggml_tensor(x_0);
@@ -1888,7 +1894,7 @@ ggml_tensor* generate_init_latent(sd_ctx_t* sd_ctx,
     return init_latent;
 }
 
-sd_image_t* generate_image(sd_ctx_t* sd_ctx, const sd_img_gen_params_t* sd_img_gen_params) {
+sd_image_t* generate_image(sd_ctx_t* sd_ctx, const sd_img_gen_params_t* sd_img_gen_params,step_callback_t step_callback) {
     int width  = sd_img_gen_params->width;
     int height = sd_img_gen_params->height;
     if (sd_version_is_dit(sd_ctx->sd->version)) {
@@ -2097,7 +2103,8 @@ sd_image_t* generate_image(sd_ctx_t* sd_ctx, const sd_img_gen_params_t* sd_img_g
                                                         sd_img_gen_params->input_id_images_path,
                                                         ref_latents,
                                                         concat_latent,
-                                                        denoise_mask);
+                                                        denoise_mask,
+                                                        step_callback);
 
     size_t t2 = ggml_time_ms();
 
diff --git a/stable-diffusion.h b/stable-diffusion.h
index e87ac2ce2..7cced3759 100644
--- a/stable-diffusion.h
+++ b/stable-diffusion.h
@@ -228,9 +228,11 @@ SD_API char* sd_ctx_params_to_str(const sd_ctx_params_t* sd_ctx_params);
 SD_API sd_ctx_t* new_sd_ctx(const sd_ctx_params_t* sd_ctx_params);
 SD_API void free_sd_ctx(sd_ctx_t* sd_ctx);
 
+typedef void (*step_callback_t)(int, struct ggml_tensor*, enum SDVersion);
+
 SD_API void sd_img_gen_params_init(sd_img_gen_params_t* sd_img_gen_params);
 SD_API char* sd_img_gen_params_to_str(const sd_img_gen_params_t* sd_img_gen_params);
-SD_API sd_image_t* generate_image(sd_ctx_t* sd_ctx, const sd_img_gen_params_t* sd_img_gen_params);
+SD_API sd_image_t* generate_image(sd_ctx_t* sd_ctx, const sd_img_gen_params_t* sd_img_gen_params, step_callback_t step_callback);
 
 SD_API void sd_vid_gen_params_init(sd_vid_gen_params_t* sd_vid_gen_params);
 SD_API sd_image_t* generate_video(sd_ctx_t* sd_ctx, const sd_vid_gen_params_t* sd_vid_gen_params);  // broken

From de9c49291f42af0d9d007cef859afa06a5e1665b Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?St=C3=A9phane=20du=20Hamel?= <stephduh@live.fr>
Date: Thu, 10 Jul 2025 19:13:43 +0200
Subject: [PATCH 02/45] fix POSIX compile: use int version in step_callback_t

---
 examples/cli/main.cpp | 2 +-
 stable-diffusion.h    | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/examples/cli/main.cpp b/examples/cli/main.cpp
index ef7877aa1..1a27f8848 100644
--- a/examples/cli/main.cpp
+++ b/examples/cli/main.cpp
@@ -1115,7 +1115,7 @@ int main(int argc, const char* argv[]) {
             params.input_id_images_path.c_str(),
         };
 
-        results              = generate_image(sd_ctx, &img_gen_params, &step_callback);
+        results              = generate_image(sd_ctx, &img_gen_params, (step_callback_t)step_callback);
         expected_num_results = params.batch_count;
     } else if (params.mode == VID_GEN) {
         sd_vid_gen_params_t vid_gen_params = {
diff --git a/stable-diffusion.h b/stable-diffusion.h
index 7cced3759..8ffafc1f7 100644
--- a/stable-diffusion.h
+++ b/stable-diffusion.h
@@ -228,7 +228,7 @@ SD_API char* sd_ctx_params_to_str(const sd_ctx_params_t* sd_ctx_params);
 SD_API sd_ctx_t* new_sd_ctx(const sd_ctx_params_t* sd_ctx_params);
 SD_API void free_sd_ctx(sd_ctx_t* sd_ctx);
 
-typedef void (*step_callback_t)(int, struct ggml_tensor*, enum SDVersion);
+typedef void (*step_callback_t)(int, struct ggml_tensor*, int);
 
 SD_API void sd_img_gen_params_init(sd_img_gen_params_t* sd_img_gen_params);
 SD_API char* sd_img_gen_params_to_str(const sd_img_gen_params_t* sd_img_gen_params);

From ee4aef89f1b10b0ad8b1d030357cc0df3beb74b3 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?St=C3=A9phane=20du=20Hamel?= <stephduh@live.fr>
Date: Thu, 10 Jul 2025 19:13:44 +0200
Subject: [PATCH 03/45] move latent preview code to a separate file

---
 examples/cli/main.cpp | 83 +++----------------------------------------
 latent-preview.h      | 83 +++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 87 insertions(+), 79 deletions(-)
 create mode 100644 latent-preview.h

diff --git a/examples/cli/main.cpp b/examples/cli/main.cpp
index 1a27f8848..9e1084e3e 100644
--- a/examples/cli/main.cpp
+++ b/examples/cli/main.cpp
@@ -15,6 +15,8 @@
 // #include "preprocessing.hpp"
 #include "stable-diffusion.h"
 
+#include "latent-preview.h"
+
 #define STB_IMAGE_IMPLEMENTATION
 #define STB_IMAGE_STATIC
 #include "stb_image.h"
@@ -755,59 +757,6 @@ void sd_log_cb(enum sd_log_level_t level, const char* log, void* data) {
     fflush(out_stream);
 }
 
-// https://github.com/comfyanonymous/ComfyUI/blob/master/comfy/latent_formats.py#L152-L169
-const float flux_latent_rgb_proj[16][3] = {
-    {-0.0346, 0.0244, 0.0681},
-    {0.0034, 0.0210, 0.0687},
-    {0.0275, -0.0668, -0.0433},
-    {-0.0174, 0.0160, 0.0617},
-    {0.0859, 0.0721, 0.0329},
-    {0.0004, 0.0383, 0.0115},
-    {0.0405, 0.0861, 0.0915},
-    {-0.0236, -0.0185, -0.0259},
-    {-0.0245, 0.0250, 0.1180},
-    {0.1008, 0.0755, -0.0421},
-    {-0.0515, 0.0201, 0.0011},
-    {0.0428, -0.0012, -0.0036},
-    {0.0817, 0.0765, 0.0749},
-    {-0.1264, -0.0522, -0.1103},
-    {-0.0280, -0.0881, -0.0499},
-    {-0.1262, -0.0982, -0.0778}};
-
-// https://github.com/Stability-AI/sd3.5/blob/main/sd3_impls.py#L228-L246
-const float sd3_latent_rgb_proj[16][3] = {
-    {-0.0645, 0.0177, 0.1052},
-    {0.0028, 0.0312, 0.0650},
-    {0.1848, 0.0762, 0.0360},
-    {0.0944, 0.0360, 0.0889},
-    {0.0897, 0.0506, -0.0364},
-    {-0.0020, 0.1203, 0.0284},
-    {0.0855, 0.0118, 0.0283},
-    {-0.0539, 0.0658, 0.1047},
-    {-0.0057, 0.0116, 0.0700},
-    {-0.0412, 0.0281, -0.0039},
-    {0.1106, 0.1171, 0.1220},
-    {-0.0248, 0.0682, -0.0481},
-    {0.0815, 0.0846, 0.1207},
-    {-0.0120, -0.0055, -0.0867},
-    {-0.0749, -0.0634, -0.0456},
-    {-0.1418, -0.1457, -0.1259},
-};
-
-// https://github.com/comfyanonymous/ComfyUI/blob/master/comfy/latent_formats.py#L32-L38
-const float sdxl_latent_rgb_proj[4][3] = {
-    {0.3651, 0.4232, 0.4341},
-    {-0.2533, -0.0042, 0.1068},
-    {0.1076, 0.1111, -0.0362},
-    {-0.3165, -0.2492, -0.2188}};
-
-// https://github.com/comfyanonymous/ComfyUI/blob/master/comfy/latent_formats.py#L32-L38
-const float sd_latent_rgb_proj[4][3]{
-    {0.3512, 0.2297, 0.3227},
-    {0.3250, 0.4974, 0.2350},
-    {-0.2829, 0.1762, 0.2721},
-    {-0.2120, -0.2616, -0.7177}};
-
 void step_callback(int step, struct ggml_tensor* latents, enum SDVersion version) {
     const int channel = 3;
     int width         = latents->ne[0];
@@ -843,33 +792,9 @@ void step_callback(int step, struct ggml_tensor* latents, enum SDVersion version
         return;
     }
     uint8_t* data = (uint8_t*)malloc(width * height * channel * sizeof(uint8_t));
-    int data_head = 0;
-    for (int j = 0; j < height; j++) {
-        for (int i = 0; i < width; i++) {
-            int latent_id = (i * latents->nb[0] + j * latents->nb[1]);
-            float r = 0, g = 0, b = 0;
-            for (int d = 0; d < dim; d++) {
-                float value = *(float*)((char*)latents->data + latent_id + d * latents->nb[2]);
-                r += value * latent_rgb_proj[d][0];
-                g += value * latent_rgb_proj[d][1];
-                b += value * latent_rgb_proj[d][2];
-            }
-
-            // change range
-            r = r * .5 + .5;
-            g = g * .5 + .5;
-            b = b * .5 + .5;
-
-            // clamp rgb values to [0,1] range
-            r = r >= 0 ? r <= 1 ? r : 1 : 0;
-            g = g >= 0 ? g <= 1 ? g : 1 : 0;
-            b = b >= 0 ? b <= 1 ? b : 1 : 0;
+    
+    preview_latent_image(data, latents, latent_rgb_proj, width, height, dim);
 
-            data[data_head++] = (uint8_t)(r * 255.);
-            data[data_head++] = (uint8_t)(g * 255.);
-            data[data_head++] = (uint8_t)(b * 255.);
-        }
-    }
     stbi_write_png("latent-preview.png", width, height, channel, data, 0);
     free(data);
 }
diff --git a/latent-preview.h b/latent-preview.h
new file mode 100644
index 000000000..5457c47ed
--- /dev/null
+++ b/latent-preview.h
@@ -0,0 +1,83 @@
+
+// https://github.com/comfyanonymous/ComfyUI/blob/master/comfy/latent_formats.py#L152-L169
+const float flux_latent_rgb_proj[16][3] = {
+    {-0.0346, 0.0244, 0.0681},
+    {0.0034, 0.0210, 0.0687},
+    {0.0275, -0.0668, -0.0433},
+    {-0.0174, 0.0160, 0.0617},
+    {0.0859, 0.0721, 0.0329},
+    {0.0004, 0.0383, 0.0115},
+    {0.0405, 0.0861, 0.0915},
+    {-0.0236, -0.0185, -0.0259},
+    {-0.0245, 0.0250, 0.1180},
+    {0.1008, 0.0755, -0.0421},
+    {-0.0515, 0.0201, 0.0011},
+    {0.0428, -0.0012, -0.0036},
+    {0.0817, 0.0765, 0.0749},
+    {-0.1264, -0.0522, -0.1103},
+    {-0.0280, -0.0881, -0.0499},
+    {-0.1262, -0.0982, -0.0778}};
+
+// https://github.com/Stability-AI/sd3.5/blob/main/sd3_impls.py#L228-L246
+const float sd3_latent_rgb_proj[16][3] = {
+    {-0.0645, 0.0177, 0.1052},
+    {0.0028, 0.0312, 0.0650},
+    {0.1848, 0.0762, 0.0360},
+    {0.0944, 0.0360, 0.0889},
+    {0.0897, 0.0506, -0.0364},
+    {-0.0020, 0.1203, 0.0284},
+    {0.0855, 0.0118, 0.0283},
+    {-0.0539, 0.0658, 0.1047},
+    {-0.0057, 0.0116, 0.0700},
+    {-0.0412, 0.0281, -0.0039},
+    {0.1106, 0.1171, 0.1220},
+    {-0.0248, 0.0682, -0.0481},
+    {0.0815, 0.0846, 0.1207},
+    {-0.0120, -0.0055, -0.0867},
+    {-0.0749, -0.0634, -0.0456},
+    {-0.1418, -0.1457, -0.1259},
+};
+
+// https://github.com/comfyanonymous/ComfyUI/blob/master/comfy/latent_formats.py#L32-L38
+const float sdxl_latent_rgb_proj[4][3] = {
+    {0.3651, 0.4232, 0.4341},
+    {-0.2533, -0.0042, 0.1068},
+    {0.1076, 0.1111, -0.0362},
+    {-0.3165, -0.2492, -0.2188}};
+
+// https://github.com/comfyanonymous/ComfyUI/blob/master/comfy/latent_formats.py#L32-L38
+const float sd_latent_rgb_proj[4][3]{
+    {0.3512, 0.2297, 0.3227},
+    {0.3250, 0.4974, 0.2350},
+    {-0.2829, 0.1762, 0.2721},
+    {-0.2120, -0.2616, -0.7177}};
+
+void preview_latent_image(uint8_t* buffer, struct ggml_tensor* latents, const float (*latent_rgb_proj)[3], int width, int height, int dim) {
+    size_t buffer_head = 0;
+    for (int j = 0; j < height; j++) {
+        for (int i = 0; i < width; i++) {
+            size_t latent_id = (i * latents->nb[0] + j * latents->nb[1]);
+            float r = 0, g = 0, b = 0;
+            for (int d = 0; d < dim; d++) {
+                float value = *(float*)((char*)latents->data + latent_id + d * latents->nb[2]);
+                r += value * latent_rgb_proj[d][0];
+                g += value * latent_rgb_proj[d][1];
+                b += value * latent_rgb_proj[d][2];
+            }
+
+            // change range
+            r = r * .5f + .5f;
+            g = g * .5f + .5f;
+            b = b * .5f + .5f;
+
+            // clamp rgb values to [0,1] range
+            r = r >= 0 ? r <= 1 ? r : 1 : 0;
+            g = g >= 0 ? g <= 1 ? g : 1 : 0;
+            b = b >= 0 ? b <= 1 ? b : 1 : 0;
+
+            buffer[buffer_head++] = (uint8_t)(r * 255);
+            buffer[buffer_head++] = (uint8_t)(g * 255);
+            buffer[buffer_head++] = (uint8_t)(b * 255);
+        }
+    }
+}
\ No newline at end of file

From 75a9abdf709aeff84dc92e33f7df4e979fedf055 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?St=C3=A9phane=20du=20Hamel?= <stephduh@live.fr>
Date: Thu, 10 Jul 2025 19:13:46 +0200
Subject: [PATCH 04/45] Latent preview support for img2img and img2vid

---
 examples/cli/main.cpp | 2 +-
 stable-diffusion.cpp  | 9 ++++++---
 stable-diffusion.h    | 2 +-
 3 files changed, 8 insertions(+), 5 deletions(-)

diff --git a/examples/cli/main.cpp b/examples/cli/main.cpp
index 9e1084e3e..168b1eb66 100644
--- a/examples/cli/main.cpp
+++ b/examples/cli/main.cpp
@@ -1058,7 +1058,7 @@ int main(int argc, const char* argv[]) {
             params.augmentation_level,
         };
 
-        results              = generate_video(sd_ctx, &vid_gen_params);
+        results              = generate_video(sd_ctx, &vid_gen_params, (step_callback_t)step_callback);
         expected_num_results = params.video_frames;
     }
 
diff --git a/stable-diffusion.cpp b/stable-diffusion.cpp
index 432b194d2..645387b83 100644
--- a/stable-diffusion.cpp
+++ b/stable-diffusion.cpp
@@ -1894,7 +1894,7 @@ ggml_tensor* generate_init_latent(sd_ctx_t* sd_ctx,
     return init_latent;
 }
 
-sd_image_t* generate_image(sd_ctx_t* sd_ctx, const sd_img_gen_params_t* sd_img_gen_params,step_callback_t step_callback) {
+sd_image_t* generate_image(sd_ctx_t* sd_ctx, const sd_img_gen_params_t* sd_img_gen_params, step_callback_t step_callback) {
     int width  = sd_img_gen_params->width;
     int height = sd_img_gen_params->height;
     if (sd_version_is_dit(sd_ctx->sd->version)) {
@@ -2113,7 +2113,7 @@ sd_image_t* generate_image(sd_ctx_t* sd_ctx, const sd_img_gen_params_t* sd_img_g
     return result_images;
 }
 
-SD_API sd_image_t* generate_video(sd_ctx_t* sd_ctx, const sd_vid_gen_params_t* sd_vid_gen_params) {
+SD_API sd_image_t* generate_video(sd_ctx_t* sd_ctx, const sd_vid_gen_params_t* sd_vid_gen_params, step_callback_t step_callback) {
     if (sd_ctx == NULL || sd_vid_gen_params == NULL) {
         return NULL;
     }
@@ -2195,7 +2195,10 @@ SD_API sd_image_t* generate_video(sd_ctx_t* sd_ctx, const sd_vid_gen_params_t* s
                                                  sd_vid_gen_params->sample_method,
                                                  sigmas,
                                                  -1,
-                                                 SDCondition(NULL, NULL, NULL));
+                                                 SDCondition(NULL, NULL, NULL),
+                                                 {},
+                                                 NULL,
+                                                 step_callback);
 
     int64_t t2 = ggml_time_ms();
     LOG_INFO("sampling completed, taking %.2fs", (t2 - t1) * 1.0f / 1000);
diff --git a/stable-diffusion.h b/stable-diffusion.h
index 8ffafc1f7..16b8ac3c4 100644
--- a/stable-diffusion.h
+++ b/stable-diffusion.h
@@ -235,7 +235,7 @@ SD_API char* sd_img_gen_params_to_str(const sd_img_gen_params_t* sd_img_gen_para
 SD_API sd_image_t* generate_image(sd_ctx_t* sd_ctx, const sd_img_gen_params_t* sd_img_gen_params, step_callback_t step_callback);
 
 SD_API void sd_vid_gen_params_init(sd_vid_gen_params_t* sd_vid_gen_params);
-SD_API sd_image_t* generate_video(sd_ctx_t* sd_ctx, const sd_vid_gen_params_t* sd_vid_gen_params);  // broken
+SD_API sd_image_t* generate_video(sd_ctx_t* sd_ctx, const sd_vid_gen_params_t* sd_vid_gen_params, step_callback_t step_callback);  // broken
 
 typedef struct upscaler_ctx_t upscaler_ctx_t;
 

From 8dcb814059f932eb460acdc954955dc168988510 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?St=C3=A9phane=20du=20Hamel?= <stephduh@live.fr>
Date: Thu, 10 Jul 2025 19:13:48 +0200
Subject: [PATCH 05/45] add latent-preview to .gitignore

---
 .gitignore | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/.gitignore b/.gitignore
index 38fe570df..2e520df2c 100644
--- a/.gitignore
+++ b/.gitignore
@@ -10,4 +10,5 @@ test/
 *.gguf
 output*.png
 models*
-*.log
\ No newline at end of file
+*.log
+latent-preview.png

From ef6207882ce5c461f8ecc8df1102f4db842ef792 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?St=C3=A9phane=20du=20Hamel?= <stephduh@live.fr>
Date: Thu, 10 Jul 2025 19:13:49 +0200
Subject: [PATCH 06/45] Refactor latent preview + support tae/vae preview

---
 .gitignore            |   2 +-
 examples/cli/main.cpp |  87 +++++++++++++--------------
 stable-diffusion.cpp  | 137 +++++++++++++++++++++++++++++++++++++++---
 stable-diffusion.h    |  13 +++-
 4 files changed, 182 insertions(+), 57 deletions(-)

diff --git a/.gitignore b/.gitignore
index 2e520df2c..552d5673c 100644
--- a/.gitignore
+++ b/.gitignore
@@ -11,4 +11,4 @@ test/
 output*.png
 models*
 *.log
-latent-preview.png
+preview.png
diff --git a/examples/cli/main.cpp b/examples/cli/main.cpp
index 168b1eb66..fea609424 100644
--- a/examples/cli/main.cpp
+++ b/examples/cli/main.cpp
@@ -9,14 +9,9 @@
 #include <string>
 #include <vector>
 
-#include "model.h"
-
-
 // #include "preprocessing.hpp"
 #include "stable-diffusion.h"
 
-#include "latent-preview.h"
-
 #define STB_IMAGE_IMPLEMENTATION
 #define STB_IMAGE_STATIC
 #include "stb_image.h"
@@ -39,6 +34,13 @@ const char* modes_str[] = {
 };
 #define SD_ALL_MODES_STR "img_gen, vid_gen, convert"
 
+const char* previews_str[] = {
+    "none",
+    "proj",
+    "tae",
+    "vae",
+};
+
 enum SDMode {
     IMG_GEN,
     VID_GEN,
@@ -116,6 +118,11 @@ struct SDParams {
     bool chroma_use_dit_mask = true;
     bool chroma_use_t5_mask  = false;
     int chroma_t5_mask_pad   = 1;
+
+    sd_preview_policy_t preview_method = SD_PREVIEW_NONE;
+    int preview_interval               = 1;
+    std::string preview_path           = "preview.png";
+    bool taesd_preview                 = false;
 };
 
 void print_params(SDParams params) {
@@ -399,6 +406,7 @@ void parse_args(int argc, const char** argv, SDParams& params) {
         {"-o", "--output", "", &params.output_path},
         {"-p", "--prompt", "", &params.prompt},
         {"-n", "--negative-prompt", "", &params.negative_prompt},
+        {"", "--preview-path", "", &params.preview_path},
 
         {"", "--upscale-model", "", &params.esrgan_path},
     };
@@ -412,6 +420,7 @@ void parse_args(int argc, const char** argv, SDParams& params) {
         {"", "--clip-skip", "", &params.clip_skip},
         {"-b", "--batch-count", "", &params.batch_count},
         {"", "--chroma-t5-mask-pad", "", &params.chroma_t5_mask_pad},
+        {"", "--preview-interval", "", &params.preview_interval},
     };
 
     options.float_options = {
@@ -442,6 +451,7 @@ void parse_args(int argc, const char** argv, SDParams& params) {
         {"", "--color", "", true, &params.color},
         {"", "--chroma-disable-dit-mask", "", false, &params.chroma_use_dit_mask},
         {"", "--chroma-enable-t5-mask", "", true, &params.chroma_use_t5_mask},
+        {"", "--taesd-preview-only", "", false, &params.taesd_preview},
     };
 
     auto on_mode_arg = [&](int argc, const char** argv, int index) {
@@ -572,6 +582,26 @@ void parse_args(int argc, const char** argv, SDParams& params) {
         return 1;
     };
 
+    auto on_preview_arg = [&](int argc, const char** argv, int index) {
+        if (++index >= argc) {
+            return -1;
+        }
+        const char* preview = argv[index];
+        int preview_method  = -1;
+        for (int m = 0; m < N_PREVIEWS; m++) {
+            if (!strcmp(preview, previews_str[m])) {
+                preview_method = m;
+            }
+        }
+        if (preview_method == -1) {
+            fprintf(stderr, "error: preview method %s\n",
+                preview);
+            return -1;
+        }
+        params.preview_method = (sd_preview_policy_t)preview_method;
+        return 1;
+    };
+
     options.manual_options = {
         {"-M", "--mode", "", on_mode_arg},
         {"", "--type", "", on_type_arg},
@@ -582,6 +612,7 @@ void parse_args(int argc, const char** argv, SDParams& params) {
         {"", "--skip-layers", "", on_skip_layers_arg},
         {"-r", "--ref-image", "", on_ref_image_arg},
         {"-h", "--help", "", on_help_arg},
+        {"", "--preview", "", on_preview_arg},
     };
 
     if (!parse_options(argc, argv, options)) {
@@ -757,52 +788,17 @@ void sd_log_cb(enum sd_log_level_t level, const char* log, void* data) {
     fflush(out_stream);
 }
 
-void step_callback(int step, struct ggml_tensor* latents, enum SDVersion version) {
-    const int channel = 3;
-    int width         = latents->ne[0];
-    int height        = latents->ne[1];
-    int dim           = latents->ne[2];
-
-    const float (*latent_rgb_proj)[channel];
-
-    if (dim == 16) {
-        // 16 channels VAE -> Flux or SD3
-
-        if (sd_version_is_sd3(version)) {
-            latent_rgb_proj = sd3_latent_rgb_proj;
-        } else if (sd_version_is_flux(version)) {
-            latent_rgb_proj = flux_latent_rgb_proj;
-        } else {
-            // unknown model
-            return;
-        }
-
-    } else if (dim == 4) {
-        // 4 channels VAE
-        if (version == VERSION_SDXL) {
-            latent_rgb_proj = sdxl_latent_rgb_proj;
-        } else if (version == VERSION_SD1 || version == VERSION_SD2) {
-            latent_rgb_proj = sd_latent_rgb_proj;
-        } else {
-            // unknown model
-            return;
-        }
-    } else {
-        // unknown latent space
-        return;
-    }
-    uint8_t* data = (uint8_t*)malloc(width * height * channel * sizeof(uint8_t));
-    
-    preview_latent_image(data, latents, latent_rgb_proj, width, height, dim);
+const char* preview_path;
 
-    stbi_write_png("latent-preview.png", width, height, channel, data, 0);
-    free(data);
+void step_callback(int step, sd_image_t image) {
+    stbi_write_png(preview_path, image.width, image.height, image.channel, image.data, 0);
 }
 
 int main(int argc, const char* argv[]) {
     SDParams params;
 
     parse_args(argc, argv, params);
+    preview_path = params.preview_path.c_str();
 
     sd_guidance_params_t guidance_params = {params.cfg_scale,
                                             params.img_cfg_scale,
@@ -958,6 +954,7 @@ int main(int argc, const char* argv[]) {
         params.control_net_cpu,
         params.vae_on_cpu,
         params.diffusion_flash_attn,
+        params.taesd_preview,
         params.diffusion_conv_direct,
         params.vae_conv_direct,
         params.chroma_use_dit_mask,
@@ -1040,7 +1037,7 @@ int main(int argc, const char* argv[]) {
             params.input_id_images_path.c_str(),
         };
 
-        results              = generate_image(sd_ctx, &img_gen_params, (step_callback_t)step_callback);
+        results              = generate_image(sd_ctx, &img_gen_params, params.preview_method, params.preview_interval,(step_callback_t)step_callback);
         expected_num_results = params.batch_count;
     } else if (params.mode == VID_GEN) {
         sd_vid_gen_params_t vid_gen_params = {
diff --git a/stable-diffusion.cpp b/stable-diffusion.cpp
index 645387b83..ba6eeedf4 100644
--- a/stable-diffusion.cpp
+++ b/stable-diffusion.cpp
@@ -20,6 +20,8 @@
 #define STB_IMAGE_STATIC
 #include "stb_image.h"
 
+#include "latent-preview.h"
+
 // #define STB_IMAGE_WRITE_IMPLEMENTATION
 // #define STB_IMAGE_WRITE_STATIC
 // #include "stb_image_write.h"
@@ -386,7 +388,7 @@ class StableDiffusionGGML {
             diffusion_model->alloc_params_buffer();
             diffusion_model->get_param_tensors(tensors);
 
-            if (!use_tiny_autoencoder) {
+            if (!use_tiny_autoencoder || sd_ctx_params->tae_preview_only) {
                 if (sd_ctx_params->keep_vae_on_cpu && !ggml_backend_is_cpu(backend)) {
                     LOG_INFO("VAE Autoencoder: Using CPU backend");
                     vae_backend = ggml_backend_cpu_init();
@@ -405,7 +407,8 @@ class StableDiffusionGGML {
                 }
                 first_stage_model->alloc_params_buffer();
                 first_stage_model->get_param_tensors(tensors, "first_stage_model");
-            } else {
+            }
+            if (use_tiny_autoencoder) {
                 tae_first_stage = std::make_shared<TinyAutoEncoder>(backend,
                                                                     model_loader.tensor_storages_types,
                                                                     "decoder.layers",
@@ -509,9 +512,10 @@ class StableDiffusionGGML {
             size_t clip_params_mem_size = cond_stage_model->get_params_buffer_size();
             size_t unet_params_mem_size = diffusion_model->get_params_buffer_size();
             size_t vae_params_mem_size  = 0;
-            if (!use_tiny_autoencoder) {
+            if (!use_tiny_autoencoder || sd_ctx_params->tae_preview_only) {
                 vae_params_mem_size = first_stage_model->get_params_buffer_size();
-            } else {
+            }
+            if (use_tiny_autoencoder) {
                 if (!tae_first_stage->load_from_file(taesd_path)) {
                     return false;
                 }
@@ -663,6 +667,7 @@ class StableDiffusionGGML {
 
         LOG_DEBUG("finished loaded file");
         ggml_free(ctx);
+        use_tiny_autoencoder = use_tiny_autoencoder && !sd_ctx_params->tae_preview_only;
         return true;
     }
 
@@ -848,6 +853,100 @@ class StableDiffusionGGML {
         LOG_DEBUG("computing svd condition graph completed, taking %" PRId64 " ms", t1 - t0);
         return {c_crossattn, y, c_concat};
     }
+void preview_image(ggml_context* work_ctx,
+                       int step,
+                       struct ggml_tensor* latents,
+                       enum SDVersion version,
+                       sd_preview_policy_t preview_mode,
+                       ggml_tensor* result,
+                       std::function<void(int, sd_image_t)> step_callback) {
+        const size_t channel = 3;
+        size_t width         = latents->ne[0];
+        size_t height        = latents->ne[1];
+        size_t dim           = latents->ne[2];
+        if (preview_mode == SD_PREVIEW_PROJ) {
+            const float (*latent_rgb_proj)[channel];
+
+            if (dim == 16) {
+                // 16 channels VAE -> Flux or SD3
+
+                if (sd_version_is_sd3(version)) {
+                    latent_rgb_proj = sd3_latent_rgb_proj;
+                } else if (sd_version_is_flux(version)) {
+                    latent_rgb_proj = flux_latent_rgb_proj;
+                } else {
+                    // unknown model
+                    return;
+                }
+
+            } else if (dim == 4) {
+                // 4 channels VAE
+                if (version == VERSION_SDXL) {
+                    latent_rgb_proj = sdxl_latent_rgb_proj;
+                } else if (version == VERSION_SD1 || version == VERSION_SD2) {
+                    latent_rgb_proj = sd_latent_rgb_proj;
+                } else {
+                    // unknown model
+                    return;
+                }
+            } else {
+                // unknown latent space
+                return;
+            }
+            uint8_t* data = (uint8_t*)malloc(width * height * channel * sizeof(uint8_t));
+
+            preview_latent_image(data, latents, latent_rgb_proj, width, height, dim);
+            sd_image_t image = {
+                width,
+                height,
+                channel,
+                data};
+            step_callback(step, image);
+            free(image.data);
+        } else {
+            if (preview_mode == SD_PREVIEW_VAE) {
+                ggml_tensor_scale(latents, 1.0f / scale_factor);
+                if (vae_tiling) {
+                    // split latent in 32x32 tiles and compute in several steps
+                    auto on_tiling = [&](ggml_tensor* in, ggml_tensor* out, bool init) {
+                        first_stage_model->compute(n_threads, in, true, &out);
+                    };
+                    sd_tiling(latents, result, 8, 32, 0.5f, on_tiling);
+                } else {
+                    first_stage_model->compute(n_threads, latents, true, &result);
+                }
+                first_stage_model->free_compute_buffer();
+
+                ggml_tensor_scale_output(result);
+            } else if (preview_mode == SD_PREVIEW_TAE) {
+                if (tae_first_stage == nullptr) {
+                    LOG_WARN("TAE not found for preview");
+                    return;
+                }
+                if (vae_tiling) {
+                    // split latent in 64x64 tiles and compute in several steps
+                    auto on_tiling = [&](ggml_tensor* in, ggml_tensor* out, bool init) {
+                        tae_first_stage->compute(n_threads, in, true, &out);
+                    };
+                    sd_tiling(latents, result, 8, 64, 0.5f, on_tiling);
+                } else {
+                    tae_first_stage->compute(n_threads, latents, true, &result);
+                }
+                tae_first_stage->free_compute_buffer();
+            } else {
+                return;
+            }
+            ggml_tensor_clamp(result, 0.0f, 1.0f);
+            sd_image_t image = {
+                width * 8,
+                height * 8,
+                channel,
+                sd_tensor_to_image(result)};
+            ggml_tensor_scale(result, 0);
+            step_callback(step, image);
+            free(image.data);
+        }
+    }
 
     ggml_tensor* sample(ggml_context* work_ctx,
                         ggml_tensor* init_latent,
@@ -865,7 +964,9 @@ class StableDiffusionGGML {
                         SDCondition id_cond,
                         std::vector<ggml_tensor*> ref_latents = {},
                         ggml_tensor* denoise_mask             = nullptr,
-                           std::function<void(int, ggml_tensor*, SDVersion)> step_callback = nullptr) {
+                        sd_preview_policy_t preview_mode                   = SD_PREVIEW_PROJ,
+                        int preview_interval                               = 1,
+                        std::function<void(int, sd_image_t)> step_callback = nullptr) {
         std::vector<int> skip_layers(guidance.slg.layers, guidance.slg.layers + guidance.slg.layer_count);
 
         float cfg_scale     = guidance.txt_cfg;
@@ -926,6 +1027,15 @@ class StableDiffusionGGML {
         }
         struct ggml_tensor* denoised = ggml_dup_tensor(work_ctx, x);
 
+        struct ggml_tensor* preview_tensor = NULL;
+        if (preview_mode != SD_PREVIEW_PROJ) {
+            preview_tensor = ggml_new_tensor_4d(work_ctx, GGML_TYPE_F32,
+                                                (denoised->ne[0] * 8),
+                                                (denoised->ne[1] * 8),
+                                                3,
+                                                denoised->ne[3]);
+        }
+
         auto denoise = [&](ggml_tensor* input, float sigma, int step) -> ggml_tensor* {
             if (step == 1) {
                 pretty_progress(0, (int)steps, 0);
@@ -1098,7 +1208,9 @@ class StableDiffusionGGML {
             }
 
             if (step_callback != nullptr) {
-                step_callback(step, denoised, version);
+                if (step % preview_interval == 0) {
+                    preview_image(work_ctx, step, denoised, version, preview_mode, preview_tensor, step_callback);
+                }
             }
             return denoised;
         };
@@ -1550,7 +1662,9 @@ sd_image_t* generate_image_internal(sd_ctx_t* sd_ctx,
                                     std::vector<ggml_tensor*> ref_latents,
                                     ggml_tensor* concat_latent                                      = NULL,
                                     ggml_tensor* denoise_mask                                       = NULL,
-                                    std::function<void(int, ggml_tensor*, SDVersion)> step_callback = nullptr) {
+                                    sd_preview_policy_t preview_mode                   = SD_PREVIEW_PROJ,
+                                    int preview_interval                               = 1,
+                                    std::function<void(int, sd_image_t)> step_callback = nullptr) {
     if (seed < 0) {
         // Generally, when using the provided command line, the seed is always >0.
         // However, to prevent potential issues if 'stable-diffusion.cpp' is invoked as a library
@@ -1820,6 +1934,8 @@ sd_image_t* generate_image_internal(sd_ctx_t* sd_ctx,
                                                      id_cond,
                                                      ref_latents,
                                                      denoise_mask,
+                                                     preview_mode,
+                                                     preview_interval,
                                                      step_callback);
 
         // struct ggml_tensor* x_0 = load_tensor_from_file(ctx, "samples_ddim.bin");
@@ -1894,7 +2010,7 @@ ggml_tensor* generate_init_latent(sd_ctx_t* sd_ctx,
     return init_latent;
 }
 
-sd_image_t* generate_image(sd_ctx_t* sd_ctx, const sd_img_gen_params_t* sd_img_gen_params, step_callback_t step_callback) {
+sd_image_t* generate_image(sd_ctx_t* sd_ctx, const sd_img_gen_params_t* sd_img_gen_params, sd_preview_policy_t preview_mode, int preview_interval, step_callback_t step_callback) {
     int width  = sd_img_gen_params->width;
     int height = sd_img_gen_params->height;
     if (sd_version_is_dit(sd_ctx->sd->version)) {
@@ -2104,6 +2220,8 @@ sd_image_t* generate_image(sd_ctx_t* sd_ctx, const sd_img_gen_params_t* sd_img_g
                                                         ref_latents,
                                                         concat_latent,
                                                         denoise_mask,
+                                                        preview_mode,
+                                                        preview_interval,
                                                         step_callback);
 
     size_t t2 = ggml_time_ms();
@@ -2198,7 +2316,8 @@ SD_API sd_image_t* generate_video(sd_ctx_t* sd_ctx, const sd_vid_gen_params_t* s
                                                  SDCondition(NULL, NULL, NULL),
                                                  {},
                                                  NULL,
-                                                 step_callback);
+                                                 (sd_preview_policy_t)0, 1,
+                                                 NULL);
 
     int64_t t2 = ggml_time_ms();
     LOG_INFO("sampling completed, taking %.2fs", (t2 - t1) * 1.0f / 1000);
diff --git a/stable-diffusion.h b/stable-diffusion.h
index 16b8ac3c4..d7522a46e 100644
--- a/stable-diffusion.h
+++ b/stable-diffusion.h
@@ -111,6 +111,14 @@ enum sd_log_level_t {
     SD_LOG_ERROR
 };
 
+enum sd_preview_policy_t {
+    SD_PREVIEW_NONE,
+    SD_PREVIEW_PROJ,
+    SD_PREVIEW_TAE,
+    SD_PREVIEW_VAE,
+    N_PREVIEWS
+};
+
 typedef struct {
     const char* model_path;
     const char* clip_l_path;
@@ -134,6 +142,7 @@ typedef struct {
     bool keep_control_net_on_cpu;
     bool keep_vae_on_cpu;
     bool diffusion_flash_attn;
+    bool tae_preview_only;
     bool diffusion_conv_direct;
     bool vae_conv_direct;
     bool chroma_use_dit_mask;
@@ -228,11 +237,11 @@ SD_API char* sd_ctx_params_to_str(const sd_ctx_params_t* sd_ctx_params);
 SD_API sd_ctx_t* new_sd_ctx(const sd_ctx_params_t* sd_ctx_params);
 SD_API void free_sd_ctx(sd_ctx_t* sd_ctx);
 
-typedef void (*step_callback_t)(int, struct ggml_tensor*, int);
+typedef void (*step_callback_t)(int, sd_image_t);
 
 SD_API void sd_img_gen_params_init(sd_img_gen_params_t* sd_img_gen_params);
 SD_API char* sd_img_gen_params_to_str(const sd_img_gen_params_t* sd_img_gen_params);
-SD_API sd_image_t* generate_image(sd_ctx_t* sd_ctx, const sd_img_gen_params_t* sd_img_gen_params, step_callback_t step_callback);
+SD_API sd_image_t* generate_image(sd_ctx_t* sd_ctx, const sd_img_gen_params_t* sd_img_gen_params, sd_preview_policy_t preview_mode, int preview_interval, step_callback_t step_callback);
 
 SD_API void sd_vid_gen_params_init(sd_vid_gen_params_t* sd_vid_gen_params);
 SD_API sd_image_t* generate_video(sd_ctx_t* sd_ctx, const sd_vid_gen_params_t* sd_vid_gen_params, step_callback_t step_callback);  // broken

From 2cedeb569e639fe282835df5e0efb83e11ee69ed Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?St=C3=A9phane=20du=20Hamel?= <stephduh@live.fr>
Date: Thu, 10 Jul 2025 19:13:50 +0200
Subject: [PATCH 07/45] update usage

---
 examples/cli/main.cpp | 9 ++++++++-
 1 file changed, 8 insertions(+), 1 deletion(-)

diff --git a/examples/cli/main.cpp b/examples/cli/main.cpp
index fea609424..3e7242926 100644
--- a/examples/cli/main.cpp
+++ b/examples/cli/main.cpp
@@ -182,6 +182,8 @@ void print_params(SDParams params) {
     printf("    chroma_use_dit_mask:   %s\n", params.chroma_use_dit_mask ? "true" : "false");
     printf("    chroma_use_t5_mask:    %s\n", params.chroma_use_t5_mask ? "true" : "false");
     printf("    chroma_t5_mask_pad:    %d\n", params.chroma_t5_mask_pad);
+    printf("    preview_mode:      %d\n", previews_str[params.preview_method]);
+    printf("    preview_interval:  %d\n", params.preview_interval);
 }
 
 void print_usage(int argc, const char* argv[]) {
@@ -198,7 +200,8 @@ void print_usage(int argc, const char* argv[]) {
     printf("  --clip_g                           path to the clip-g text encoder\n");
     printf("  --t5xxl                            path to the t5xxl text encoder\n");
     printf("  --vae [VAE]                        path to vae\n");
-    printf("  --taesd [TAESD_PATH]               path to taesd. Using Tiny AutoEncoder for fast decoding (low quality)\n");
+    printf("  --taesd [TAESD]                    path to taesd. Using Tiny AutoEncoder for fast decoding (low quality)\n");
+    printf("  --taesd-preview-only               prevents usage of taesd for decoding the final image. (for use with --preview %s)\n", previews_str[SD_PREVIEW_TAE]);
     printf("  --control-net [CONTROL_PATH]       path to control net model\n");
     printf("  --embd-dir [EMBEDDING_PATH]        path to embeddings\n");
     printf("  --stacked-id-embd-dir [DIR]        path to PHOTOMAKER stacked id embeddings\n");
@@ -254,6 +257,10 @@ void print_usage(int argc, const char* argv[]) {
     printf("                                     This might crash if it is not supported by the backend.\n");
     printf("  --control-net-cpu                  keep controlnet in cpu (for low vram)\n");
     printf("  --canny                            apply canny preprocessor (edge detection)\n");
+    printf("  --preview {%s,%s,%s,%s}            preview method. (default is %s(disabled))\n", previews_str[0], previews_str[1], previews_str[2], previews_str[3], previews_str[SD_PREVIEW_NONE]);
+    printf("                                     %s is the fastest\n", previews_str[SD_PREVIEW_PROJ]);
+    printf("  --preview-interval [N]             How often to save the image preview");
+    printf("  --preview-path [PATH}              path to write preview image to (default: ./preview.png)\n");
     printf("  --color                            colors the logging tags according to level\n");
     printf("  --chroma-disable-dit-mask          disable dit mask for chroma\n");
     printf("  --chroma-enable-t5-mask            enable t5 mask for chroma\n");

From be0a442cefb201d344a6e7fd6c2e58ab35fdfd67 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?St=C3=A9phane=20du=20Hamel?= <stephduh@live.fr>
Date: Thu, 10 Jul 2025 19:13:51 +0200
Subject: [PATCH 08/45] Fix build + add warning

---
 stable-diffusion.cpp | 11 +++++++----
 1 file changed, 7 insertions(+), 4 deletions(-)

diff --git a/stable-diffusion.cpp b/stable-diffusion.cpp
index ba6eeedf4..d0b776687 100644
--- a/stable-diffusion.cpp
+++ b/stable-diffusion.cpp
@@ -860,10 +860,10 @@ void preview_image(ggml_context* work_ctx,
                        sd_preview_policy_t preview_mode,
                        ggml_tensor* result,
                        std::function<void(int, sd_image_t)> step_callback) {
-        const size_t channel = 3;
-        size_t width         = latents->ne[0];
-        size_t height        = latents->ne[1];
-        size_t dim           = latents->ne[2];
+        const uint32_t channel = 3;
+        uint32_t width         = latents->ne[0];
+        uint32_t height        = latents->ne[1];
+        uint32_t dim           = latents->ne[2];
         if (preview_mode == SD_PREVIEW_PROJ) {
             const float (*latent_rgb_proj)[channel];
 
@@ -875,6 +875,7 @@ void preview_image(ggml_context* work_ctx,
                 } else if (sd_version_is_flux(version)) {
                     latent_rgb_proj = flux_latent_rgb_proj;
                 } else {
+                    LOG_WARN("No latent to RGB projection known for this model");
                     // unknown model
                     return;
                 }
@@ -887,9 +888,11 @@ void preview_image(ggml_context* work_ctx,
                     latent_rgb_proj = sd_latent_rgb_proj;
                 } else {
                     // unknown model
+                    LOG_WARN("No latent to RGB projection known for this model");
                     return;
                 }
             } else {
+                LOG_WARN("No latent to RGB projection known for this model");
                 // unknown latent space
                 return;
             }

From 31b0fdd01aa008d1801535446ef7c20842777ec8 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?St=C3=A9phane=20du=20Hamel?= <stephduh@live.fr>
Date: Thu, 10 Jul 2025 19:13:53 +0200
Subject: [PATCH 09/45] Disable preview by default in sdcpp too

---
 stable-diffusion.cpp | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/stable-diffusion.cpp b/stable-diffusion.cpp
index d0b776687..e197ee84d 100644
--- a/stable-diffusion.cpp
+++ b/stable-diffusion.cpp
@@ -967,7 +967,7 @@ void preview_image(ggml_context* work_ctx,
                         SDCondition id_cond,
                         std::vector<ggml_tensor*> ref_latents = {},
                         ggml_tensor* denoise_mask             = nullptr,
-                        sd_preview_policy_t preview_mode                   = SD_PREVIEW_PROJ,
+                        sd_preview_policy_t preview_mode                   = SD_PREVIEW_NONE,
                         int preview_interval                               = 1,
                         std::function<void(int, sd_image_t)> step_callback = nullptr) {
         std::vector<int> skip_layers(guidance.slg.layers, guidance.slg.layers + guidance.slg.layer_count);
@@ -1665,7 +1665,7 @@ sd_image_t* generate_image_internal(sd_ctx_t* sd_ctx,
                                     std::vector<ggml_tensor*> ref_latents,
                                     ggml_tensor* concat_latent                                      = NULL,
                                     ggml_tensor* denoise_mask                                       = NULL,
-                                    sd_preview_policy_t preview_mode                   = SD_PREVIEW_PROJ,
+                                    sd_preview_policy_t preview_mode                   = SD_PREVIEW_NONE,
                                     int preview_interval                               = 1,
                                     std::function<void(int, sd_image_t)> step_callback = nullptr) {
     if (seed < 0) {

From 95fd31ccb7e639bbd0130bc32e7ff20cc8515e6f Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?St=C3=A9phane=20du=20Hamel?= <stephduh@live.fr>
Date: Thu, 10 Jul 2025 19:13:54 +0200
Subject: [PATCH 10/45] Do not preload preview tensor when preview is
 disabled.

---
 stable-diffusion.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/stable-diffusion.cpp b/stable-diffusion.cpp
index e197ee84d..b8c841cd2 100644
--- a/stable-diffusion.cpp
+++ b/stable-diffusion.cpp
@@ -1031,7 +1031,7 @@ void preview_image(ggml_context* work_ctx,
         struct ggml_tensor* denoised = ggml_dup_tensor(work_ctx, x);
 
         struct ggml_tensor* preview_tensor = NULL;
-        if (preview_mode != SD_PREVIEW_PROJ) {
+        if (preview_mode != SD_PREVIEW_NONE && preview_mode != SD_PREVIEW_PROJ) {
             preview_tensor = ggml_new_tensor_4d(work_ctx, GGML_TYPE_F32,
                                                 (denoised->ne[0] * 8),
                                                 (denoised->ne[1] * 8),

From cbd8c996fad13039f254d70740ee10ab01d4cd9d Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?St=C3=A9phane=20du=20Hamel?= <stephduh@live.fr>
Date: Thu, 10 Jul 2025 19:13:55 +0200
Subject: [PATCH 11/45] Fix VAE preview darkening

---
 stable-diffusion.cpp | 1 +
 1 file changed, 1 insertion(+)

diff --git a/stable-diffusion.cpp b/stable-diffusion.cpp
index b8c841cd2..77c329293 100644
--- a/stable-diffusion.cpp
+++ b/stable-diffusion.cpp
@@ -919,6 +919,7 @@ void preview_image(ggml_context* work_ctx,
                     first_stage_model->compute(n_threads, latents, true, &result);
                 }
                 first_stage_model->free_compute_buffer();
+                ggml_tensor_scale(latents, scale_factor);
 
                 ggml_tensor_scale_output(result);
             } else if (preview_mode == SD_PREVIEW_TAE) {

From c3d72c04bc7267195e9727420c0edd4a2f22bea4 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?St=C3=A9phane=20du=20Hamel?= <stephduh@live.fr>
Date: Thu, 10 Jul 2025 19:13:56 +0200
Subject: [PATCH 12/45] Increase context memory when loading multiple auto
 encoders

---
 stable-diffusion.cpp | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/stable-diffusion.cpp b/stable-diffusion.cpp
index 77c329293..4e777a2d7 100644
--- a/stable-diffusion.cpp
+++ b/stable-diffusion.cpp
@@ -2042,6 +2042,9 @@ sd_image_t* generate_image(sd_ctx_t* sd_ctx, const sd_img_gen_params_t* sd_img_g
     if (sd_ctx->sd->stacked_id) {
         params.mem_size += static_cast<size_t>(10 * 1024 * 1024);  // 10 MB
     }
+    if (sd_ctx->sd->first_stage_model != nullptr && sd_ctx->sd->tae_first_stage != nullptr) {
+        params.mem_size *= 2;
+    }
     params.mem_size += width * height * 3 * sizeof(float) * 3;
     params.mem_size += width * height * 3 * sizeof(float) * 3 * sd_img_gen_params->ref_images_count;
     params.mem_size *= sd_img_gen_params->batch_count;

From 8059ac343d7cda24cf3c9666c00606dcd2e81b44 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?St=C3=A9phane=20du=20Hamel?= <stephduh@live.fr>
Date: Thu, 10 Jul 2025 19:13:58 +0200
Subject: [PATCH 13/45] Increase context memory when previewing with auto
 encoder instead

---
 stable-diffusion.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/stable-diffusion.cpp b/stable-diffusion.cpp
index 4e777a2d7..4be0c9b76 100644
--- a/stable-diffusion.cpp
+++ b/stable-diffusion.cpp
@@ -2042,7 +2042,7 @@ sd_image_t* generate_image(sd_ctx_t* sd_ctx, const sd_img_gen_params_t* sd_img_g
     if (sd_ctx->sd->stacked_id) {
         params.mem_size += static_cast<size_t>(10 * 1024 * 1024);  // 10 MB
     }
-    if (sd_ctx->sd->first_stage_model != nullptr && sd_ctx->sd->tae_first_stage != nullptr) {
+    if (preview_mode!=SD_PREVIEW_NONE && preview_mode!=SD_PREVIEW_PROJ) {
         params.mem_size *= 2;
     }
     params.mem_size += width * height * 3 * sizeof(float) * 3;

From 8e6024f81a89b2c36f325bf5cec8af880d25c067 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?St=C3=A9phane=20du=20Hamel?= <stephduh@live.fr>
Date: Thu, 10 Jul 2025 19:13:59 +0200
Subject: [PATCH 14/45] fix compile warnings

---
 latent-preview.h | 80 ++++++++++++++++++++++++------------------------
 1 file changed, 40 insertions(+), 40 deletions(-)

diff --git a/latent-preview.h b/latent-preview.h
index 5457c47ed..ca4d132f3 100644
--- a/latent-preview.h
+++ b/latent-preview.h
@@ -1,56 +1,56 @@
 
 // https://github.com/comfyanonymous/ComfyUI/blob/master/comfy/latent_formats.py#L152-L169
 const float flux_latent_rgb_proj[16][3] = {
-    {-0.0346, 0.0244, 0.0681},
-    {0.0034, 0.0210, 0.0687},
-    {0.0275, -0.0668, -0.0433},
-    {-0.0174, 0.0160, 0.0617},
-    {0.0859, 0.0721, 0.0329},
-    {0.0004, 0.0383, 0.0115},
-    {0.0405, 0.0861, 0.0915},
-    {-0.0236, -0.0185, -0.0259},
-    {-0.0245, 0.0250, 0.1180},
-    {0.1008, 0.0755, -0.0421},
-    {-0.0515, 0.0201, 0.0011},
-    {0.0428, -0.0012, -0.0036},
-    {0.0817, 0.0765, 0.0749},
-    {-0.1264, -0.0522, -0.1103},
-    {-0.0280, -0.0881, -0.0499},
-    {-0.1262, -0.0982, -0.0778}};
+    {-0.0346f, 0.0244f, 0.0681f},
+    {0.0034f, 0.0210f, 0.0687f},
+    {0.0275f, -0.0668f, -0.0433f},
+    {-0.0174f, 0.0160f, 0.0617f},
+    {0.0859f, 0.0721f, 0.0329f},
+    {0.0004f, 0.0383f, 0.0115f},
+    {0.0405f, 0.0861f, 0.0915f},
+    {-0.0236f, -0.0185f, -0.0259f},
+    {-0.0245f, 0.0250f, 0.1180f},
+    {0.1008f, 0.0755f, -0.0421f},
+    {-0.0515f, 0.0201f, 0.0011f},
+    {0.0428f, -0.0012f, -0.0036f},
+    {0.0817f, 0.0765f, 0.0749f},
+    {-0.1264f, -0.0522f, -0.1103f},
+    {-0.0280f, -0.0881f, -0.0499f},
+    {-0.1262f, -0.0982f, -0.0778f}};
 
 // https://github.com/Stability-AI/sd3.5/blob/main/sd3_impls.py#L228-L246
 const float sd3_latent_rgb_proj[16][3] = {
-    {-0.0645, 0.0177, 0.1052},
-    {0.0028, 0.0312, 0.0650},
-    {0.1848, 0.0762, 0.0360},
-    {0.0944, 0.0360, 0.0889},
-    {0.0897, 0.0506, -0.0364},
-    {-0.0020, 0.1203, 0.0284},
-    {0.0855, 0.0118, 0.0283},
-    {-0.0539, 0.0658, 0.1047},
-    {-0.0057, 0.0116, 0.0700},
-    {-0.0412, 0.0281, -0.0039},
-    {0.1106, 0.1171, 0.1220},
-    {-0.0248, 0.0682, -0.0481},
-    {0.0815, 0.0846, 0.1207},
-    {-0.0120, -0.0055, -0.0867},
-    {-0.0749, -0.0634, -0.0456},
-    {-0.1418, -0.1457, -0.1259},
+    {-0.0645f, 0.0177f, 0.1052f},
+    {0.0028f, 0.0312f, 0.0650f},
+    {0.1848f, 0.0762f, 0.0360f},
+    {0.0944f, 0.0360f, 0.0889f},
+    {0.0897f, 0.0506f, -0.0364f},
+    {-0.0020f, 0.1203f, 0.0284f},
+    {0.0855f, 0.0118f, 0.0283f},
+    {-0.0539f, 0.0658f, 0.1047f},
+    {-0.0057f, 0.0116f, 0.0700f},
+    {-0.0412f, 0.0281f, -0.0039f},
+    {0.1106f, 0.1171f, 0.1220f},
+    {-0.0248f, 0.0682f, -0.0481f},
+    {0.0815f, 0.0846f, 0.1207f},
+    {-0.0120f, -0.0055f, -0.0867f},
+    {-0.0749f, -0.0634f, -0.0456f},
+    {-0.1418f, -0.1457f, -0.1259f},
 };
 
 // https://github.com/comfyanonymous/ComfyUI/blob/master/comfy/latent_formats.py#L32-L38
 const float sdxl_latent_rgb_proj[4][3] = {
-    {0.3651, 0.4232, 0.4341},
-    {-0.2533, -0.0042, 0.1068},
-    {0.1076, 0.1111, -0.0362},
-    {-0.3165, -0.2492, -0.2188}};
+    {0.3651f, 0.4232f, 0.4341f},
+    {-0.2533f, -0.0042f, 0.1068f},
+    {0.1076f, 0.1111f, -0.0362f},
+    {-0.3165f, -0.2492f, -0.2188f}};
 
 // https://github.com/comfyanonymous/ComfyUI/blob/master/comfy/latent_formats.py#L32-L38
 const float sd_latent_rgb_proj[4][3]{
-    {0.3512, 0.2297, 0.3227},
-    {0.3250, 0.4974, 0.2350},
-    {-0.2829, 0.1762, 0.2721},
-    {-0.2120, -0.2616, -0.7177}};
+    {0.3512f, 0.2297f, 0.3227f},
+    {0.3250f, 0.4974f, 0.2350f},
+    {-0.2829f, 0.1762f, 0.2721f},
+    {-0.2120f, -0.2616f, -0.7177f}};
 
 void preview_latent_image(uint8_t* buffer, struct ggml_tensor* latents, const float (*latent_rgb_proj)[3], int width, int height, int dim) {
     size_t buffer_head = 0;

From 19ac567924ccbdfe0a238a1ce7fa9f64361d91f0 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?St=C3=A9phane=20du=20Hamel?= <stephduh@live.fr>
Date: Thu, 10 Jul 2025 19:14:00 +0200
Subject: [PATCH 15/45] fix print-params

---
 examples/cli/main.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/examples/cli/main.cpp b/examples/cli/main.cpp
index 3e7242926..a74fb6b55 100644
--- a/examples/cli/main.cpp
+++ b/examples/cli/main.cpp
@@ -182,7 +182,7 @@ void print_params(SDParams params) {
     printf("    chroma_use_dit_mask:   %s\n", params.chroma_use_dit_mask ? "true" : "false");
     printf("    chroma_use_t5_mask:    %s\n", params.chroma_use_t5_mask ? "true" : "false");
     printf("    chroma_t5_mask_pad:    %d\n", params.chroma_t5_mask_pad);
-    printf("    preview_mode:      %d\n", previews_str[params.preview_method]);
+    printf("    preview_mode:      %s\n", previews_str[params.preview_method]);
     printf("    preview_interval:  %d\n", params.preview_interval);
 }
 

From 430f7d8b6b0d9db88b6903e1cf42fbfad9d7f2d3 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?St=C3=A9phane=20du=20Hamel?= <stephduh@live.fr>
Date: Thu, 10 Jul 2025 19:14:01 +0200
Subject: [PATCH 16/45] fix preview with unet inpaint models

---
 stable-diffusion.cpp | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/stable-diffusion.cpp b/stable-diffusion.cpp
index 4be0c9b76..da679529f 100644
--- a/stable-diffusion.cpp
+++ b/stable-diffusion.cpp
@@ -882,9 +882,9 @@ void preview_image(ggml_context* work_ctx,
 
             } else if (dim == 4) {
                 // 4 channels VAE
-                if (version == VERSION_SDXL) {
+                if (sd_version_is_sdxl(version)) {
                     latent_rgb_proj = sdxl_latent_rgb_proj;
-                } else if (version == VERSION_SD1 || version == VERSION_SD2) {
+                } else if (sd_version_is_sd1(version) || sd_version_is_sd2(version)) {
                     latent_rgb_proj = sd_latent_rgb_proj;
                 } else {
                     // unknown model
@@ -2042,7 +2042,7 @@ sd_image_t* generate_image(sd_ctx_t* sd_ctx, const sd_img_gen_params_t* sd_img_g
     if (sd_ctx->sd->stacked_id) {
         params.mem_size += static_cast<size_t>(10 * 1024 * 1024);  // 10 MB
     }
-    if (preview_mode!=SD_PREVIEW_NONE && preview_mode!=SD_PREVIEW_PROJ) {
+    if (preview_mode != SD_PREVIEW_NONE && preview_mode != SD_PREVIEW_PROJ) {
         params.mem_size *= 2;
     }
     params.mem_size += width * height * 3 * sizeof(float) * 3;

From 2272068b96133902bf96abd7b5b18b63dd83b4e4 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?St=C3=A9phane=20du=20Hamel?= <stephduh@live.fr>
Date: Thu, 10 Jul 2025 19:14:02 +0200
Subject: [PATCH 17/45] do not spam pretty progress when using tiled vae/tae as
 preview

---
 stable-diffusion.cpp | 28 +++++++++++++++++++++++++---
 stable-diffusion.h   |  2 ++
 util.cpp             |  6 ++++++
 3 files changed, 33 insertions(+), 3 deletions(-)

diff --git a/stable-diffusion.cpp b/stable-diffusion.cpp
index da679529f..6c19e6ce2 100644
--- a/stable-diffusion.cpp
+++ b/stable-diffusion.cpp
@@ -71,6 +71,14 @@ void calculate_alphas_cumprod(float* alphas_cumprod,
     }
 }
 
+void suppress_pp(int step, int steps, float time, void* data) {
+    (void)step;
+    (void)steps;
+    (void)time;
+    (void)data;
+    return;
+}
+
 /*=============================================== StableDiffusionGGML ================================================*/
 
 class StableDiffusionGGML {
@@ -853,7 +861,16 @@ class StableDiffusionGGML {
         LOG_DEBUG("computing svd condition graph completed, taking %" PRId64 " ms", t1 - t0);
         return {c_crossattn, y, c_concat};
     }
-void preview_image(ggml_context* work_ctx,
+
+    void silent_tiling(ggml_tensor* input, ggml_tensor* output, const int scale, const int tile_size, const float tile_overlap_factor, on_tile_process on_processing) {
+        sd_progress_cb_t cb = sd_get_progress_callback();
+        void* cbd           = sd_get_progress_callback_data();
+        sd_set_progress_callback((sd_progress_cb_t)suppress_pp, NULL);
+        sd_tiling(input, output, scale, tile_size, tile_overlap_factor, on_processing);
+        sd_set_progress_callback(cb, cbd);
+    }
+
+    void preview_image(ggml_context* work_ctx,
                        int step,
                        struct ggml_tensor* latents,
                        enum SDVersion version,
@@ -914,7 +931,8 @@ void preview_image(ggml_context* work_ctx,
                     auto on_tiling = [&](ggml_tensor* in, ggml_tensor* out, bool init) {
                         first_stage_model->compute(n_threads, in, true, &out);
                     };
-                    sd_tiling(latents, result, 8, 32, 0.5f, on_tiling);
+                    silent_tiling(latents, result, 8, 32, 0.5f, on_tiling);
+
                 } else {
                     first_stage_model->compute(n_threads, latents, true, &result);
                 }
@@ -932,7 +950,7 @@ void preview_image(ggml_context* work_ctx,
                     auto on_tiling = [&](ggml_tensor* in, ggml_tensor* out, bool init) {
                         tae_first_stage->compute(n_threads, in, true, &out);
                     };
-                    sd_tiling(latents, result, 8, 64, 0.5f, on_tiling);
+                    silent_tiling(latents, result, 8, 64, 0.5f, on_tiling);
                 } else {
                     tae_first_stage->compute(n_threads, latents, true, &result);
                 }
@@ -1210,6 +1228,10 @@ void preview_image(ggml_context* work_ctx,
                     }
                 }
             }
+            if (step > 0) {
+                pretty_progress(step, (int)steps, (t1 - t0) / 1000000.f);
+                // LOG_INFO("step %d sampling completed taking %.2fs", step, (t1 - t0) * 1.0f / 1000000);
+            }
 
             if (step_callback != nullptr) {
                 if (step % preview_interval == 0) {
diff --git a/stable-diffusion.h b/stable-diffusion.h
index d7522a46e..a043e1904 100644
--- a/stable-diffusion.h
+++ b/stable-diffusion.h
@@ -219,6 +219,8 @@ typedef void (*sd_progress_cb_t)(int step, int steps, float time, void* data);
 
 SD_API void sd_set_log_callback(sd_log_cb_t sd_log_cb, void* data);
 SD_API void sd_set_progress_callback(sd_progress_cb_t cb, void* data);
+SD_API sd_progress_cb_t sd_get_progress_callback();
+SD_API void* sd_get_progress_callback_data();
 SD_API int32_t get_num_physical_cores();
 SD_API const char* sd_get_system_info();
 
diff --git a/util.cpp b/util.cpp
index 92bc9ef50..18caf8567 100644
--- a/util.cpp
+++ b/util.cpp
@@ -420,6 +420,12 @@ void sd_set_progress_callback(sd_progress_cb_t cb, void* data) {
     sd_progress_cb      = cb;
     sd_progress_cb_data = data;
 }
+sd_progress_cb_t sd_get_progress_callback(){
+    return sd_progress_cb;
+}
+void* sd_get_progress_callback_data(){
+    return sd_progress_cb_data;
+}
 const char* sd_get_system_info() {
     static char buffer[1024];
     std::stringstream ss;

From eeca6979b104ef1ee287551a54f8c6108e6ad472 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?St=C3=A9phane=20du=20Hamel?= <stephduh@live.fr>
Date: Thu, 10 Jul 2025 19:14:04 +0200
Subject: [PATCH 18/45] change log level of "processing %i tiles"

---
 ggml_extend.hpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/ggml_extend.hpp b/ggml_extend.hpp
index 22dd88c94..a5c15de16 100644
--- a/ggml_extend.hpp
+++ b/ggml_extend.hpp
@@ -636,7 +636,7 @@ __STATIC_INLINE__ void sd_tiling(ggml_tensor* input, ggml_tensor* output, const
     ggml_tensor* output_tile = ggml_new_tensor_4d(tiles_ctx, GGML_TYPE_F32, tile_size * scale, tile_size * scale, output->ne[2], 1);
     on_processing(input_tile, NULL, true);
     int num_tiles = ceil((float)input_width / non_tile_overlap) * ceil((float)input_height / non_tile_overlap);
-    LOG_INFO("processing %i tiles", num_tiles);
+    LOG_DEBUG("processing %i tiles", num_tiles);
     pretty_progress(1, num_tiles, 0.0f);
     int tile_count = 1;
     bool last_y = false, last_x = false;

From beb0e91c846aa2325966c886f29322a3e30ef023 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?St=C3=A9phane=20du=20Hamel?= <stephduh@live.fr>
Date: Thu, 10 Jul 2025 19:14:05 +0200
Subject: [PATCH 19/45] Refactor preview to match the other callbacks

---
 examples/cli/main.cpp | 15 ++++++++-------
 stable-diffusion.cpp  | 43 ++++++++++++++++---------------------------
 stable-diffusion.h    | 12 +++++-------
 util.cpp              | 25 +++++++++++++++++++++++--
 util.h                |  7 +++++++
 5 files changed, 59 insertions(+), 43 deletions(-)

diff --git a/examples/cli/main.cpp b/examples/cli/main.cpp
index a74fb6b55..560565434 100644
--- a/examples/cli/main.cpp
+++ b/examples/cli/main.cpp
@@ -119,10 +119,10 @@ struct SDParams {
     bool chroma_use_t5_mask  = false;
     int chroma_t5_mask_pad   = 1;
 
-    sd_preview_policy_t preview_method = SD_PREVIEW_NONE;
-    int preview_interval               = 1;
-    std::string preview_path           = "preview.png";
-    bool taesd_preview                 = false;
+    sd_preview_t preview_method = SD_PREVIEW_NONE;
+    int preview_interval        = 1;
+    std::string preview_path    = "preview.png";
+    bool taesd_preview          = false;
 };
 
 void print_params(SDParams params) {
@@ -605,7 +605,7 @@ void parse_args(int argc, const char** argv, SDParams& params) {
                 preview);
             return -1;
         }
-        params.preview_method = (sd_preview_policy_t)preview_method;
+        params.preview_method = (sd_preview_t)preview_method;
         return 1;
     };
 
@@ -820,6 +820,7 @@ int main(int argc, const char* argv[]) {
                                             }};
 
     sd_set_log_callback(sd_log_cb, (void*)&params);
+    sd_set_preview_callback((sd_preview_cb_t)step_callback, params.preview_method, params.preview_interval);
 
     if (params.verbose) {
         print_params(params);
@@ -1044,7 +1045,7 @@ int main(int argc, const char* argv[]) {
             params.input_id_images_path.c_str(),
         };
 
-        results              = generate_image(sd_ctx, &img_gen_params, params.preview_method, params.preview_interval,(step_callback_t)step_callback);
+        results              = generate_image(sd_ctx, &img_gen_params);
         expected_num_results = params.batch_count;
     } else if (params.mode == VID_GEN) {
         sd_vid_gen_params_t vid_gen_params = {
@@ -1062,7 +1063,7 @@ int main(int argc, const char* argv[]) {
             params.augmentation_level,
         };
 
-        results              = generate_video(sd_ctx, &vid_gen_params, (step_callback_t)step_callback);
+        results              = generate_video(sd_ctx, &vid_gen_params);
         expected_num_results = params.video_frames;
     }
 
diff --git a/stable-diffusion.cpp b/stable-diffusion.cpp
index 6c19e6ce2..d40c0ee82 100644
--- a/stable-diffusion.cpp
+++ b/stable-diffusion.cpp
@@ -874,7 +874,7 @@ class StableDiffusionGGML {
                        int step,
                        struct ggml_tensor* latents,
                        enum SDVersion version,
-                       sd_preview_policy_t preview_mode,
+                       sd_preview_t preview_mode,
                        ggml_tensor* result,
                        std::function<void(int, sd_image_t)> step_callback) {
         const uint32_t channel = 3;
@@ -985,10 +985,7 @@ class StableDiffusionGGML {
                         int start_merge_step,
                         SDCondition id_cond,
                         std::vector<ggml_tensor*> ref_latents = {},
-                        ggml_tensor* denoise_mask             = nullptr,
-                        sd_preview_policy_t preview_mode                   = SD_PREVIEW_NONE,
-                        int preview_interval                               = 1,
-                        std::function<void(int, sd_image_t)> step_callback = nullptr) {
+                        ggml_tensor* denoise_mask             = nullptr) {
         std::vector<int> skip_layers(guidance.slg.layers, guidance.slg.layers + guidance.slg.layer_count);
 
         float cfg_scale     = guidance.txt_cfg;
@@ -1050,7 +1047,8 @@ class StableDiffusionGGML {
         struct ggml_tensor* denoised = ggml_dup_tensor(work_ctx, x);
 
         struct ggml_tensor* preview_tensor = NULL;
-        if (preview_mode != SD_PREVIEW_NONE && preview_mode != SD_PREVIEW_PROJ) {
+        auto sd_preview_mode = sd_get_preview_mode();
+        if (sd_preview_mode != SD_PREVIEW_NONE && sd_preview_mode != SD_PREVIEW_PROJ) {
             preview_tensor = ggml_new_tensor_4d(work_ctx, GGML_TYPE_F32,
                                                 (denoised->ne[0] * 8),
                                                 (denoised->ne[1] * 8),
@@ -1232,10 +1230,11 @@ class StableDiffusionGGML {
                 pretty_progress(step, (int)steps, (t1 - t0) / 1000000.f);
                 // LOG_INFO("step %d sampling completed taking %.2fs", step, (t1 - t0) * 1.0f / 1000000);
             }
-
-            if (step_callback != nullptr) {
-                if (step % preview_interval == 0) {
-                    preview_image(work_ctx, step, denoised, version, preview_mode, preview_tensor, step_callback);
+            auto sd_preview_cb     = sd_get_preview_callback();
+            const int preview_every = sd_get_preview_interval();
+            if (sd_preview_cb != NULL) {
+                if (preview_every != 0 && step % preview_every == 0) {  // an interval of 0 would divide by zero
+                    preview_image(work_ctx, step, denoised, version, sd_get_preview_mode(), preview_tensor, sd_preview_cb);
                 }
             }
             return denoised;
@@ -1687,10 +1686,7 @@ sd_image_t* generate_image_internal(sd_ctx_t* sd_ctx,
                                     std::string input_id_images_path,
                                     std::vector<ggml_tensor*> ref_latents,
                                     ggml_tensor* concat_latent                                      = NULL,
-                                    ggml_tensor* denoise_mask                                       = NULL,
-                                    sd_preview_policy_t preview_mode                   = SD_PREVIEW_NONE,
-                                    int preview_interval                               = 1,
-                                    std::function<void(int, sd_image_t)> step_callback = nullptr) {
+                                    ggml_tensor* denoise_mask                                       = NULL) {
     if (seed < 0) {
         // Generally, when using the provided command line, the seed is always >0.
         // However, to prevent potential issues if 'stable-diffusion.cpp' is invoked as a library
@@ -1959,10 +1955,7 @@ sd_image_t* generate_image_internal(sd_ctx_t* sd_ctx,
                                                      start_merge_step,
                                                      id_cond,
                                                      ref_latents,
-                                                     denoise_mask,
-                                                     preview_mode,
-                                                     preview_interval,
-                                                     step_callback);
+                                                     denoise_mask);
 
         // struct ggml_tensor* x_0 = load_tensor_from_file(ctx, "samples_ddim.bin");
         // print_ggml_tensor(x_0);
@@ -2036,7 +2029,7 @@ ggml_tensor* generate_init_latent(sd_ctx_t* sd_ctx,
     return init_latent;
 }
 
-sd_image_t* generate_image(sd_ctx_t* sd_ctx, const sd_img_gen_params_t* sd_img_gen_params, sd_preview_policy_t preview_mode, int preview_interval, step_callback_t step_callback) {
+sd_image_t* generate_image(sd_ctx_t* sd_ctx, const sd_img_gen_params_t* sd_img_gen_params) {
     int width  = sd_img_gen_params->width;
     int height = sd_img_gen_params->height;
     if (sd_version_is_dit(sd_ctx->sd->version)) {
@@ -2064,7 +2057,8 @@ sd_image_t* generate_image(sd_ctx_t* sd_ctx, const sd_img_gen_params_t* sd_img_g
     if (sd_ctx->sd->stacked_id) {
         params.mem_size += static_cast<size_t>(10 * 1024 * 1024);  // 10 MB
     }
-    if (preview_mode != SD_PREVIEW_NONE && preview_mode != SD_PREVIEW_PROJ) {
+    auto sd_preview_mode = sd_get_preview_mode();
+    if (sd_preview_mode != SD_PREVIEW_NONE && sd_preview_mode != SD_PREVIEW_PROJ) {
         params.mem_size *= 2;
     }
     params.mem_size += width * height * 3 * sizeof(float) * 3;
@@ -2248,10 +2242,7 @@ sd_image_t* generate_image(sd_ctx_t* sd_ctx, const sd_img_gen_params_t* sd_img_g
                                                         sd_img_gen_params->input_id_images_path,
                                                         ref_latents,
                                                         concat_latent,
-                                                        denoise_mask,
-                                                        preview_mode,
-                                                        preview_interval,
-                                                        step_callback);
+                                                        denoise_mask);
 
     size_t t2 = ggml_time_ms();
 
@@ -2260,7 +2251,7 @@ sd_image_t* generate_image(sd_ctx_t* sd_ctx, const sd_img_gen_params_t* sd_img_g
     return result_images;
 }
 
-SD_API sd_image_t* generate_video(sd_ctx_t* sd_ctx, const sd_vid_gen_params_t* sd_vid_gen_params, step_callback_t step_callback) {
+SD_API sd_image_t* generate_video(sd_ctx_t* sd_ctx, const sd_vid_gen_params_t* sd_vid_gen_params) {
     if (sd_ctx == NULL || sd_vid_gen_params == NULL) {
         return NULL;
     }
@@ -2344,8 +2335,6 @@ SD_API sd_image_t* generate_video(sd_ctx_t* sd_ctx, const sd_vid_gen_params_t* s
                                                  -1,
                                                  SDCondition(NULL, NULL, NULL),
                                                  {},
-                                                 NULL,
-                                                 (sd_preview_policy_t)0, 1,
                                                  NULL);
 
     int64_t t2 = ggml_time_ms();
diff --git a/stable-diffusion.h b/stable-diffusion.h
index a043e1904..8e80d9314 100644
--- a/stable-diffusion.h
+++ b/stable-diffusion.h
@@ -111,7 +111,7 @@ enum sd_log_level_t {
     SD_LOG_ERROR
 };
 
-enum sd_preview_policy_t {
+enum sd_preview_t {
     SD_PREVIEW_NONE,
     SD_PREVIEW_PROJ,
     SD_PREVIEW_TAE,
@@ -216,11 +216,11 @@ typedef struct sd_ctx_t sd_ctx_t;
 
 typedef void (*sd_log_cb_t)(enum sd_log_level_t level, const char* text, void* data);
 typedef void (*sd_progress_cb_t)(int step, int steps, float time, void* data);
+typedef void (*sd_preview_cb_t)(int, sd_image_t);
 
 SD_API void sd_set_log_callback(sd_log_cb_t sd_log_cb, void* data);
 SD_API void sd_set_progress_callback(sd_progress_cb_t cb, void* data);
-SD_API sd_progress_cb_t sd_get_progress_callback();
-SD_API void* sd_get_progress_callback_data();
+SD_API void sd_set_preview_callback(sd_preview_cb_t cb, sd_preview_t mode, int interval);
 SD_API int32_t get_num_physical_cores();
 SD_API const char* sd_get_system_info();
 
@@ -239,14 +239,12 @@ SD_API char* sd_ctx_params_to_str(const sd_ctx_params_t* sd_ctx_params);
 SD_API sd_ctx_t* new_sd_ctx(const sd_ctx_params_t* sd_ctx_params);
 SD_API void free_sd_ctx(sd_ctx_t* sd_ctx);
 
-typedef void (*step_callback_t)(int, sd_image_t);
-
 SD_API void sd_img_gen_params_init(sd_img_gen_params_t* sd_img_gen_params);
 SD_API char* sd_img_gen_params_to_str(const sd_img_gen_params_t* sd_img_gen_params);
-SD_API sd_image_t* generate_image(sd_ctx_t* sd_ctx, const sd_img_gen_params_t* sd_img_gen_params, sd_preview_policy_t preview_mode, int preview_interval, step_callback_t step_callback);
+SD_API sd_image_t* generate_image(sd_ctx_t* sd_ctx, const sd_img_gen_params_t* sd_img_gen_params);
 
 SD_API void sd_vid_gen_params_init(sd_vid_gen_params_t* sd_vid_gen_params);
-SD_API sd_image_t* generate_video(sd_ctx_t* sd_ctx, const sd_vid_gen_params_t* sd_vid_gen_params, step_callback_t step_callback);  // broken
+SD_API sd_image_t* generate_video(sd_ctx_t* sd_ctx, const sd_vid_gen_params_t* sd_vid_gen_params);  // broken
 
 typedef struct upscaler_ctx_t upscaler_ctx_t;
 
diff --git a/util.cpp b/util.cpp
index 18caf8567..cac5b1408 100644
--- a/util.cpp
+++ b/util.cpp
@@ -247,6 +247,10 @@ int32_t get_num_physical_cores() {
 static sd_progress_cb_t sd_progress_cb = NULL;
 void* sd_progress_cb_data              = NULL;
 
+static sd_preview_cb_t sd_preview_cb = NULL;
+sd_preview_t sd_preview_mode         = SD_PREVIEW_NONE;
+int sd_preview_interval              = 1;
+
 std::u32string utf8_to_utf32(const std::string& utf8_str) {
     std::wstring_convert<std::codecvt_utf8<char32_t>, char32_t> converter;
     return converter.from_bytes(utf8_str);
@@ -420,10 +424,27 @@ void sd_set_progress_callback(sd_progress_cb_t cb, void* data) {
     sd_progress_cb      = cb;
     sd_progress_cb_data = data;
 }
-sd_progress_cb_t sd_get_progress_callback(){
+void sd_set_preview_callback(sd_preview_cb_t cb, sd_preview_t mode = SD_PREVIEW_PROJ, int interval = 1) {
+    sd_preview_cb       = cb;
+    sd_preview_mode     = mode;
+    sd_preview_interval = interval;
+}
+
+sd_preview_cb_t sd_get_preview_callback() {
+    return sd_preview_cb;
+}
+
+sd_preview_t sd_get_preview_mode() {
+    return sd_preview_mode;
+}
+int sd_get_preview_interval() {
+    return sd_preview_interval;
+}
+
+sd_progress_cb_t sd_get_progress_callback() {
     return sd_progress_cb;
 }
-void* sd_get_progress_callback_data(){
+void* sd_get_progress_callback_data() {
     return sd_progress_cb_data;
 }
 const char* sd_get_system_info() {
diff --git a/util.h b/util.h
index d98c9a280..bbcee8905 100644
--- a/util.h
+++ b/util.h
@@ -57,6 +57,13 @@ std::string trim(const std::string& s);
 
 std::vector<std::pair<std::string, float>> parse_prompt_attention(const std::string& text);
 
+sd_progress_cb_t sd_get_progress_callback();
+void* sd_get_progress_callback_data();
+
+sd_preview_cb_t sd_get_preview_callback();
+sd_preview_t sd_get_preview_mode();
+int sd_get_preview_interval();
+
 #define LOG_DEBUG(format, ...) log_printf(SD_LOG_DEBUG, __FILE__, __LINE__, format, ##__VA_ARGS__)
 #define LOG_INFO(format, ...) log_printf(SD_LOG_INFO, __FILE__, __LINE__, format, ##__VA_ARGS__)
 #define LOG_WARN(format, ...) log_printf(SD_LOG_WARN, __FILE__, __LINE__, format, ##__VA_ARGS__)

From d465a70efd10e72cd16f66c0d1f9d17b16db180d Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?St=C3=A9phane=20du=20Hamel?= <stephduh@live.fr>
Date: Wed, 23 Jul 2025 17:08:12 +0200
Subject: [PATCH 20/45] preview: new API

---
 examples/cli/main.cpp | 12 ++++++------
 stable-diffusion.cpp  | 35 +++++++++++++++++++++++++++++------
 stable-diffusion.h    | 16 +++++++++-------
 util.cpp              |  6 +++---
 util.h                |  2 +-
 5 files changed, 48 insertions(+), 23 deletions(-)

diff --git a/examples/cli/main.cpp b/examples/cli/main.cpp
index 560565434..f63435b78 100644
--- a/examples/cli/main.cpp
+++ b/examples/cli/main.cpp
@@ -119,7 +119,7 @@ struct SDParams {
     bool chroma_use_t5_mask  = false;
     int chroma_t5_mask_pad   = 1;
 
-    sd_preview_t preview_method = SD_PREVIEW_NONE;
+    preview_t preview_method = PREVIEW_NONE;
     int preview_interval        = 1;
     std::string preview_path    = "preview.png";
     bool taesd_preview          = false;
@@ -201,7 +201,7 @@ void print_usage(int argc, const char* argv[]) {
     printf("  --t5xxl                            path to the t5xxl text encoder\n");
     printf("  --vae [VAE]                        path to vae\n");
     printf("  --taesd [TAESD]                    path to taesd. Using Tiny AutoEncoder for fast decoding (low quality)\n");
-    printf("  --taesd-preview-only               prevents usage of taesd for decoding the final image. (for use with --preview %s)\n", previews_str[SD_PREVIEW_TAE]);
+    printf("  --taesd-preview-only               prevents usage of taesd for decoding the final image. (for use with --preview %s)\n", previews_str[PREVIEW_TAE]);
     printf("  --control-net [CONTROL_PATH]       path to control net model\n");
     printf("  --embd-dir [EMBEDDING_PATH]        path to embeddings\n");
     printf("  --stacked-id-embd-dir [DIR]        path to PHOTOMAKER stacked id embeddings\n");
@@ -257,8 +257,8 @@ void print_usage(int argc, const char* argv[]) {
     printf("                                     This might crash if it is not supported by the backend.\n");
     printf("  --control-net-cpu                  keep controlnet in cpu (for low vram)\n");
     printf("  --canny                            apply canny preprocessor (edge detection)\n");
-    printf("  --preview {%s,%s,%s,%s}            preview method. (default is %s(disabled))\n", previews_str[0], previews_str[1], previews_str[2], previews_str[3], previews_str[SD_PREVIEW_NONE]);
-    printf("                                     %s is the fastest\n", previews_str[SD_PREVIEW_PROJ]);
+    printf("  --preview {%s,%s,%s,%s}            preview method. (default is %s(disabled))\n", previews_str[0], previews_str[1], previews_str[2], previews_str[3], previews_str[PREVIEW_NONE]);
+    printf("                                     %s is the fastest\n", previews_str[PREVIEW_PROJ]);
     printf("  --preview-interval [N]             How often to save the image preview");
     printf("  --preview-path [PATH}              path to write preview image to (default: ./preview.png)\n");
     printf("  --color                            colors the logging tags according to level\n");
@@ -595,7 +595,7 @@ void parse_args(int argc, const char** argv, SDParams& params) {
         }
         const char* preview = argv[index];
         int preview_method  = -1;
-        for (int m = 0; m < N_PREVIEWS; m++) {
+        for (int m = 0; m < PREVIEW_COUNT; m++) {
             if (!strcmp(preview, previews_str[m])) {
                 preview_method = m;
             }
@@ -605,7 +605,7 @@ void parse_args(int argc, const char** argv, SDParams& params) {
                 preview);
             return -1;
         }
-        params.preview_method = (sd_preview_t)preview_method;
+        params.preview_method = (preview_t)preview_method;
         return 1;
     };
 
diff --git a/stable-diffusion.cpp b/stable-diffusion.cpp
index d40c0ee82..6a1567735 100644
--- a/stable-diffusion.cpp
+++ b/stable-diffusion.cpp
@@ -874,14 +874,14 @@ class StableDiffusionGGML {
                        int step,
                        struct ggml_tensor* latents,
                        enum SDVersion version,
-                       sd_preview_t preview_mode,
+                       preview_t preview_mode,
                        ggml_tensor* result,
                        std::function<void(int, sd_image_t)> step_callback) {
         const uint32_t channel = 3;
         uint32_t width         = latents->ne[0];
         uint32_t height        = latents->ne[1];
         uint32_t dim           = latents->ne[2];
-        if (preview_mode == SD_PREVIEW_PROJ) {
+        if (preview_mode == PREVIEW_PROJ) {
             const float (*latent_rgb_proj)[channel];
 
             if (dim == 16) {
@@ -924,7 +924,7 @@ class StableDiffusionGGML {
             step_callback(step, image);
             free(image.data);
         } else {
-            if (preview_mode == SD_PREVIEW_VAE) {
+            if (preview_mode == PREVIEW_VAE) {
                 ggml_tensor_scale(latents, 1.0f / scale_factor);
                 if (vae_tiling) {
                     // split latent in 32x32 tiles and compute in several steps
@@ -940,7 +940,7 @@ class StableDiffusionGGML {
                 ggml_tensor_scale(latents, scale_factor);
 
                 ggml_tensor_scale_output(result);
-            } else if (preview_mode == SD_PREVIEW_TAE) {
+            } else if (preview_mode == PREVIEW_TAE) {
                 if (tae_first_stage == nullptr) {
                     LOG_WARN("TAE not found for preview");
                     return;
@@ -1048,7 +1048,7 @@ class StableDiffusionGGML {
 
         struct ggml_tensor* preview_tensor = NULL;
         auto sd_preview_mode = sd_get_preview_mode();
-        if (sd_preview_mode != SD_PREVIEW_NONE && sd_preview_mode != SD_PREVIEW_PROJ) {
+        if (sd_preview_mode != PREVIEW_NONE && sd_preview_mode != PREVIEW_PROJ) {
             preview_tensor = ggml_new_tensor_4d(work_ctx, GGML_TYPE_F32,
                                                 (denoised->ne[0] * 8),
                                                 (denoised->ne[1] * 8),
@@ -1447,6 +1447,29 @@ enum schedule_t str_to_schedule(const char* str) {
     return SCHEDULE_COUNT;
 }
 
+// Display names for preview_t, indexed by enum value.
+const char* preview_to_str[] = {
+    "none",
+    "proj",
+    "tae",
+    "vae",
+};
+
+// Returns the display name of `preview`, or NONE_STR when it is not below PREVIEW_COUNT.
+const char* sd_preview_name(enum preview_t preview) {
+    const bool in_range = preview < PREVIEW_COUNT;
+    return in_range ? preview_to_str[preview] : NONE_STR;
+}
+
+enum preview_t str_to_preview(const char* str) {
+    // Scan the name table in order; the index of the first exact match is the enum value.
+    int idx = 0;
+    while (idx < PREVIEW_COUNT && strcmp(str, preview_to_str[idx]) != 0) {
+        idx++;
+    }
+    return (enum preview_t)idx;  // equals PREVIEW_COUNT when nothing matched
+}
+
 void sd_ctx_params_init(sd_ctx_params_t* sd_ctx_params) {
     memset((void*)sd_ctx_params, 0, sizeof(sd_ctx_params_t));
     sd_ctx_params->vae_decode_only         = true;
@@ -2058,7 +2081,7 @@ sd_image_t* generate_image(sd_ctx_t* sd_ctx, const sd_img_gen_params_t* sd_img_g
         params.mem_size += static_cast<size_t>(10 * 1024 * 1024);  // 10 MB
     }
     auto sd_preview_mode = sd_get_preview_mode();
-    if (sd_preview_mode != SD_PREVIEW_NONE && sd_preview_mode != SD_PREVIEW_PROJ) {
+    if (sd_preview_mode != PREVIEW_NONE && sd_preview_mode != PREVIEW_PROJ) {
         params.mem_size *= 2;
     }
     params.mem_size += width * height * 3 * sizeof(float) * 3;
diff --git a/stable-diffusion.h b/stable-diffusion.h
index 8e80d9314..f2be3dfef 100644
--- a/stable-diffusion.h
+++ b/stable-diffusion.h
@@ -111,12 +111,12 @@ enum sd_log_level_t {
     SD_LOG_ERROR
 };
 
-enum sd_preview_t {
-    SD_PREVIEW_NONE,
-    SD_PREVIEW_PROJ,
-    SD_PREVIEW_TAE,
-    SD_PREVIEW_VAE,
-    N_PREVIEWS
+enum preview_t {
+    PREVIEW_NONE,
+    PREVIEW_PROJ,
+    PREVIEW_TAE,
+    PREVIEW_VAE,
+    PREVIEW_COUNT
 };
 
 typedef struct {
@@ -220,7 +220,7 @@ typedef void (*sd_preview_cb_t)(int, sd_image_t);
 
 SD_API void sd_set_log_callback(sd_log_cb_t sd_log_cb, void* data);
 SD_API void sd_set_progress_callback(sd_progress_cb_t cb, void* data);
-SD_API void sd_set_preview_callback(sd_preview_cb_t cb, sd_preview_t mode, int interval);
+SD_API void sd_set_preview_callback(sd_preview_cb_t cb, preview_t mode, int interval);
 SD_API int32_t get_num_physical_cores();
 SD_API const char* sd_get_system_info();
 
@@ -232,6 +232,8 @@ SD_API const char* sd_sample_method_name(enum sample_method_t sample_method);
 SD_API enum sample_method_t str_to_sample_method(const char* str);
 SD_API const char* sd_schedule_name(enum schedule_t schedule);
 SD_API enum schedule_t str_to_schedule(const char* str);
+SD_API const char* sd_preview_name(enum preview_t preview);
+SD_API enum preview_t str_to_preview(const char* str);
 
 SD_API void sd_ctx_params_init(sd_ctx_params_t* sd_ctx_params);
 SD_API char* sd_ctx_params_to_str(const sd_ctx_params_t* sd_ctx_params);
diff --git a/util.cpp b/util.cpp
index cac5b1408..741b77cc5 100644
--- a/util.cpp
+++ b/util.cpp
@@ -248,7 +248,7 @@ static sd_progress_cb_t sd_progress_cb = NULL;
 void* sd_progress_cb_data              = NULL;
 
 static sd_preview_cb_t sd_preview_cb = NULL;
-sd_preview_t sd_preview_mode         = SD_PREVIEW_NONE;
+preview_t sd_preview_mode         = PREVIEW_NONE;
 int sd_preview_interval              = 1;
 
 std::u32string utf8_to_utf32(const std::string& utf8_str) {
@@ -424,7 +424,7 @@ void sd_set_progress_callback(sd_progress_cb_t cb, void* data) {
     sd_progress_cb      = cb;
     sd_progress_cb_data = data;
 }
-void sd_set_preview_callback(sd_preview_cb_t cb, sd_preview_t mode = SD_PREVIEW_PROJ, int interval = 1) {
+void sd_set_preview_callback(sd_preview_cb_t cb, preview_t mode = PREVIEW_PROJ, int interval = 1) {
     sd_preview_cb       = cb;
     sd_preview_mode     = mode;
     sd_preview_interval = interval;
@@ -434,7 +434,7 @@ sd_preview_cb_t sd_get_preview_callback() {
     return sd_preview_cb;
 }
 
-sd_preview_t sd_get_preview_mode() {
+preview_t sd_get_preview_mode() {
     return sd_preview_mode;
 }
 int sd_get_preview_interval() {
diff --git a/util.h b/util.h
index bbcee8905..ac32cd080 100644
--- a/util.h
+++ b/util.h
@@ -61,7 +61,7 @@ sd_progress_cb_t sd_get_progress_callback();
 void* sd_get_progress_callback_data();
 
 sd_preview_cb_t sd_get_preview_callback();
-sd_preview_t sd_get_preview_mode();
+preview_t sd_get_preview_mode();
 int sd_get_preview_interval();
 
 #define LOG_DEBUG(format, ...) log_printf(SD_LOG_DEBUG, __FILE__, __LINE__, format, ##__VA_ARGS__)

From 55ef7beb09b0dd7371a82a2cf0605aba2b77bbc4 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?St=C3=A9phane=20du=20Hamel?= <stephduh@live.fr>
Date: Sun, 31 Aug 2025 02:08:30 +0200
Subject: [PATCH 21/45] latent proj bias

---
 latent-preview.h     | 11 +++++++++--
 stable-diffusion.cpp |  7 ++++++-
 2 files changed, 15 insertions(+), 3 deletions(-)

diff --git a/latent-preview.h b/latent-preview.h
index ca4d132f3..d21700108 100644
--- a/latent-preview.h
+++ b/latent-preview.h
@@ -1,4 +1,3 @@
-
 // https://github.com/comfyanonymous/ComfyUI/blob/master/comfy/latent_formats.py#L152-L169
 const float flux_latent_rgb_proj[16][3] = {
     {-0.0346f, 0.0244f, 0.0681f},
@@ -17,6 +16,7 @@ const float flux_latent_rgb_proj[16][3] = {
     {-0.1264f, -0.0522f, -0.1103f},
     {-0.0280f, -0.0881f, -0.0499f},
     {-0.1262f, -0.0982f, -0.0778f}};
+static float flux_latent_rgb_bias[3] = {-0.0329f, -0.0718f, -0.0851f};  // RGB offset added after the projection; static because this lives in a header
 
 // https://github.com/Stability-AI/sd3.5/blob/main/sd3_impls.py#L228-L246
 const float sd3_latent_rgb_proj[16][3] = {
@@ -37,6 +37,7 @@ const float sd3_latent_rgb_proj[16][3] = {
     {-0.0749f, -0.0634f, -0.0456f},
     {-0.1418f, -0.1457f, -0.1259f},
 };
+static float sd3_latent_rgb_bias[3] = {0.0f, 0.0f, 0.0f};  // RGB offset added after the projection; static because this lives in a header
 
 // https://github.com/comfyanonymous/ComfyUI/blob/master/comfy/latent_formats.py#L32-L38
 const float sdxl_latent_rgb_proj[4][3] = {
@@ -44,6 +45,7 @@ const float sdxl_latent_rgb_proj[4][3] = {
     {-0.2533f, -0.0042f, 0.1068f},
     {0.1076f, 0.1111f, -0.0362f},
     {-0.3165f, -0.2492f, -0.2188f}};
+static float sdxl_latent_rgb_bias[3] = {0.1084f, -0.0175f, -0.0011f};  // RGB offset added after the projection; static because this lives in a header
 
 // https://github.com/comfyanonymous/ComfyUI/blob/master/comfy/latent_formats.py#L32-L38
 const float sd_latent_rgb_proj[4][3]{
@@ -51,8 +53,9 @@ const float sd_latent_rgb_proj[4][3]{
     {0.3250f, 0.4974f, 0.2350f},
     {-0.2829f, 0.1762f, 0.2721f},
     {-0.2120f, -0.2616f, -0.7177f}};
+static float sd_latent_rgb_bias[3] = {0.0f, 0.0f, 0.0f};  // RGB offset added after the projection; static because this lives in a header
 
-void preview_latent_image(uint8_t* buffer, struct ggml_tensor* latents, const float (*latent_rgb_proj)[3], int width, int height, int dim) {
+void preview_latent_image(uint8_t* buffer, struct ggml_tensor* latents, const float (*latent_rgb_proj)[3], const float latent_rgb_bias[3], int width, int height, int dim) {
     size_t buffer_head = 0;
     for (int j = 0; j < height; j++) {
         for (int i = 0; i < width; i++) {
@@ -64,6 +67,10 @@ void preview_latent_image(uint8_t* buffer, struct ggml_tensor* latents, const fl
                 g += value * latent_rgb_proj[d][1];
                 b += value * latent_rgb_proj[d][2];
             }
+                // bias
+                r += latent_rgb_bias[0];
+                g += latent_rgb_bias[1];
+                b += latent_rgb_bias[2];
 
             // change range
             r = r * .5f + .5f;
diff --git a/stable-diffusion.cpp b/stable-diffusion.cpp
index 6a1567735..886c7428f 100644
--- a/stable-diffusion.cpp
+++ b/stable-diffusion.cpp
@@ -883,14 +883,17 @@ class StableDiffusionGGML {
         uint32_t dim           = latents->ne[2];
         if (preview_mode == PREVIEW_PROJ) {
             const float (*latent_rgb_proj)[channel];
+            float *latent_rgb_bias;
 
             if (dim == 16) {
                 // 16 channels VAE -> Flux or SD3
 
                 if (sd_version_is_sd3(version)) {
                     latent_rgb_proj = sd3_latent_rgb_proj;
+                    latent_rgb_bias = sd3_latent_rgb_bias;
                 } else if (sd_version_is_flux(version)) {
                     latent_rgb_proj = flux_latent_rgb_proj;
+                    latent_rgb_bias = flux_latent_rgb_bias;
                 } else {
                     LOG_WARN("No latent to RGB projection known for this model");
                     // unknown model
@@ -901,8 +904,10 @@ class StableDiffusionGGML {
                 // 4 channels VAE
                 if (sd_version_is_sdxl(version)) {
                     latent_rgb_proj = sdxl_latent_rgb_proj;
+                    latent_rgb_bias = sdxl_latent_rgb_bias;
                 } else if (sd_version_is_sd1(version) || sd_version_is_sd2(version)) {
                     latent_rgb_proj = sd_latent_rgb_proj;
+                    latent_rgb_bias = sd_latent_rgb_bias;
                 } else {
                     // unknown model
                     LOG_WARN("No latent to RGB projection known for this model");
@@ -915,7 +920,7 @@ class StableDiffusionGGML {
             }
             uint8_t* data = (uint8_t*)malloc(width * height * channel * sizeof(uint8_t));
 
-            preview_latent_image(data, latents, latent_rgb_proj, width, height, dim);
+            preview_latent_image(data, latents, latent_rgb_proj,latent_rgb_bias, width, height, dim);
             sd_image_t image = {
                 width,
                 height,

From a5278ceec350216d4ab60ee3582636821d27a4ce Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?St=C3=A9phane=20du=20Hamel?= <stephduh@live.fr>
Date: Tue, 9 Sep 2025 13:15:27 +0200
Subject: [PATCH 22/45] fix merge issues

---
 stable-diffusion.cpp | 64 ++++++--------------------------------------
 1 file changed, 8 insertions(+), 56 deletions(-)

diff --git a/stable-diffusion.cpp b/stable-diffusion.cpp
index fb6a1a251..3d9b5a653 100644
--- a/stable-diffusion.cpp
+++ b/stable-diffusion.cpp
@@ -729,6 +729,7 @@ class StableDiffusionGGML {
 
         LOG_DEBUG("finished loaded file");
         ggml_free(ctx);
+        use_tiny_autoencoder = use_tiny_autoencoder && !sd_ctx_params->tae_preview_only;
         return true;
     }
 
@@ -763,51 +764,6 @@ class StableDiffusionGGML {
                 LOG_ERROR("Unknown scheduler %i", scheduler);
                 abort();
         }
-        if (sd_ctx_params->schedule != DEFAULT) {
-            switch (sd_ctx_params->schedule) {
-                case DISCRETE:
-                    LOG_INFO("running with discrete schedule");
-                    denoiser->schedule = std::make_shared<DiscreteSchedule>();
-                    break;
-                case KARRAS:
-                    LOG_INFO("running with Karras schedule");
-                    denoiser->schedule = std::make_shared<KarrasSchedule>();
-                    break;
-                case EXPONENTIAL:
-                    LOG_INFO("running exponential schedule");
-                    denoiser->schedule = std::make_shared<ExponentialSchedule>();
-                    break;
-                case AYS:
-                    LOG_INFO("Running with Align-Your-Steps schedule");
-                    denoiser->schedule          = std::make_shared<AYSSchedule>();
-                    denoiser->schedule->version = version;
-                    break;
-                case GITS:
-                    LOG_INFO("Running with GITS schedule");
-                    denoiser->schedule          = std::make_shared<GITSSchedule>();
-                    denoiser->schedule->version = version;
-                    break;
-                case DEFAULT:
-                    // Don't touch anything.
-                    break;
-                default:
-                    LOG_ERROR("Unknown schedule %i", sd_ctx_params->schedule);
-                    abort();
-            }
-        }
-
-        auto comp_vis_denoiser = std::dynamic_pointer_cast<CompVisDenoiser>(denoiser);
-        if (comp_vis_denoiser) {
-            for (int i = 0; i < TIMESTEPS; i++) {
-                comp_vis_denoiser->sigmas[i]     = std::sqrt((1 - ((float*)alphas_cumprod_tensor->data)[i]) / ((float*)alphas_cumprod_tensor->data)[i]);
-                comp_vis_denoiser->log_sigmas[i] = std::log(comp_vis_denoiser->sigmas[i]);
-            }
-        }
-
-        LOG_DEBUG("finished loaded file");
-        ggml_free(ctx);
-        use_tiny_autoencoder = use_tiny_autoencoder && !sd_ctx_params->tae_preview_only;
-        return true;
     }
 
     bool is_using_v_parameterization_for_sd2(ggml_context* work_ctx, bool is_inpaint = false) {
@@ -1134,21 +1090,19 @@ class StableDiffusionGGML {
             free(image.data);
         } else {
             if (preview_mode == PREVIEW_VAE) {
-                ggml_tensor_scale(latents, 1.0f / scale_factor);
+                process_latent_out(latents);
                 if (vae_tiling) {
                     // split latent in 32x32 tiles and compute in several steps
                     auto on_tiling = [&](ggml_tensor* in, ggml_tensor* out, bool init) {
-                        first_stage_model->compute(n_threads, in, true, &out);
+                        first_stage_model->compute(n_threads, in, true, &out, NULL);
                     };
                     silent_tiling(latents, result, 8, 32, 0.5f, on_tiling);
 
                 } else {
-                    first_stage_model->compute(n_threads, latents, true, &result);
+                    first_stage_model->compute(n_threads, latents, true, &result, work_ctx);
                 }
                 first_stage_model->free_compute_buffer();
-                ggml_tensor_scale(latents, scale_factor);
-
-                ggml_tensor_scale_output(result);
+                process_vae_output_tensor(result);
             } else if (preview_mode == PREVIEW_TAE) {
                 if (tae_first_stage == nullptr) {
                     LOG_WARN("TAE not found for preview");
@@ -1157,11 +1111,11 @@ class StableDiffusionGGML {
                 if (vae_tiling) {
                     // split latent in 64x64 tiles and compute in several steps
                     auto on_tiling = [&](ggml_tensor* in, ggml_tensor* out, bool init) {
-                        tae_first_stage->compute(n_threads, in, true, &out);
+                        tae_first_stage->compute(n_threads, in, true, &out, NULL);
                     };
                     silent_tiling(latents, result, 8, 64, 0.5f, on_tiling);
                 } else {
-                    tae_first_stage->compute(n_threads, latents, true, &result);
+                    tae_first_stage->compute(n_threads, latents, true, &result, work_ctx);
                 }
                 tae_first_stage->free_compute_buffer();
             } else {
@@ -2884,9 +2838,7 @@ SD_API sd_image_t* generate_video(sd_ctx_t* sd_ctx, const sd_vid_gen_params_t* s
                                           -1,
                                           {},
                                           {},
-                                          denoise_mask,
-                                                 {},
-                                                 NULL);
+                                          denoise_mask);
 
         int64_t sampling_end = ggml_time_ms();
         LOG_INFO("sampling completed, taking %.2fs", (sampling_end - sampling_start) * 1.0f / 1000);

From 030aa3df2e4eacb86e2ed0f3e2342847f4d3244b Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?St=C3=A9phane=20du=20Hamel?= <stephduh@live.fr>
Date: Sat, 30 Aug 2025 18:27:17 +0200
Subject: [PATCH 23/45] add wan latent projs

---
 latent-preview.h | 70 +++++++++++++++++++++++++++++++++++++++++++++++-
 1 file changed, 69 insertions(+), 1 deletion(-)

diff --git a/latent-preview.h b/latent-preview.h
index d21700108..c9994a2bd 100644
--- a/latent-preview.h
+++ b/latent-preview.h
@@ -1,3 +1,71 @@
+const float wan_21_latent_rgb_proj[16][3] = {
+    {-0.1299, -0.1692, 0.2932},
+    {0.0671, 0.0406, 0.0442},
+    {0.3568, 0.2548, 0.1747},
+    {0.0372, 0.2344, 0.1420},
+    {0.0313, 0.0189, -0.0328},
+    {0.0296, -0.0956, -0.0665},
+    {-0.3477, -0.4059, -0.2925},
+    {0.0166, 0.1902, 0.1975},
+    {-0.0412, 0.0267, -0.1364},
+    {-0.1293, 0.0740, 0.1636},
+    {0.0680, 0.3019, 0.1128},
+    {0.0032, 0.0581, 0.0639},
+    {-0.1251, 0.0927, 0.1699},
+    {0.0060, -0.0633, 0.0005},
+    {0.3477, 0.2275, 0.2950},
+    {0.1984, 0.0913, 0.1861}};
+
+const float wan_22_latent_rgb_proj[48][3] = {
+    {0.0119, 0.0103, 0.0046},
+    {-0.1062, -0.0504, 0.0165},
+    {0.0140, 0.0409, 0.0491},
+    {-0.0813, -0.0677, 0.0607},
+    {0.0656, 0.0851, 0.0808},
+    {0.0264, 0.0463, 0.0912},
+    {0.0295, 0.0326, 0.0590},
+    {-0.0244, -0.0270, 0.0025},
+    {0.0443, -0.0102, 0.0288},
+    {-0.0465, -0.0090, -0.0205},
+    {0.0359, 0.0236, 0.0082},
+    {-0.0776, 0.0854, 0.1048},
+    {0.0564, 0.0264, 0.0561},
+    {0.0006, 0.0594, 0.0418},
+    {-0.0319, -0.0542, -0.0637},
+    {-0.0268, 0.0024, 0.0260},
+    {0.0539, 0.0265, 0.0358},
+    {-0.0359, -0.0312, -0.0287},
+    {-0.0285, -0.1032, -0.1237},
+    {0.1041, 0.0537, 0.0622},
+    {-0.0086, -0.0374, -0.0051},
+    {0.0390, 0.0670, 0.2863},
+    {0.0069, 0.0144, 0.0082},
+    {0.0006, -0.0167, 0.0079},
+    {0.0313, -0.0574, -0.0232},
+    {-0.1454, -0.0902, -0.0481},
+    {0.0714, 0.0827, 0.0447},
+    {-0.0304, -0.0574, -0.0196},
+    {0.0401, 0.0384, 0.0204},
+    {-0.0758, -0.0297, -0.0014},
+    {0.0568, 0.1307, 0.1372},
+    {-0.0055, -0.0310, -0.0380},
+    {0.0239, -0.0305, 0.0325},
+    {-0.0663, -0.0673, -0.0140},
+    {-0.0416, -0.0047, -0.0023},
+    {0.0166, 0.0112, -0.0093},
+    {-0.0211, 0.0011, 0.0331},
+    {0.1833, 0.1466, 0.2250},
+    {-0.0368, 0.0370, 0.0295},
+    {-0.3441, -0.3543, -0.2008},
+    {-0.0479, -0.0489, -0.0420},
+    {-0.0660, -0.0153, 0.0800},
+    {-0.0101, 0.0068, 0.0156},
+    {-0.0690, -0.0452, -0.0927},
+    {-0.0145, 0.0041, 0.0015},
+    {0.0421, 0.0451, 0.0373},
+    {0.0504, -0.0483, -0.0356},
+    {-0.0837, 0.0168, 0.0055}};
+
 // https://github.com/comfyanonymous/ComfyUI/blob/master/comfy/latent_formats.py#L152-L169
 const float flux_latent_rgb_proj[16][3] = {
     {-0.0346f, 0.0244f, 0.0681f},
@@ -87,4 +155,4 @@ void preview_latent_image(uint8_t* buffer, struct ggml_tensor* latents, const fl
             buffer[buffer_head++] = (uint8_t)(b * 255);
         }
     }
-}
\ No newline at end of file
+}

From 4c536b5fd3cef01a8f39ab7a1a878b741a78acd7 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?St=C3=A9phane=20du=20Hamel?= <stephduh@live.fr>
Date: Sun, 31 Aug 2025 01:33:58 +0200
Subject: [PATCH 24/45] animated previews

---
 examples/cli/main.cpp |  37 ++++++++---
 latent-preview.h      |  46 ++++++-------
 stable-diffusion.cpp  | 147 ++++++++++++++++++++++++++++--------------
 stable-diffusion.h    |   2 +-
 4 files changed, 154 insertions(+), 78 deletions(-)

diff --git a/examples/cli/main.cpp b/examples/cli/main.cpp
index 25d8a578b..75a4e1d26 100644
--- a/examples/cli/main.cpp
+++ b/examples/cli/main.cpp
@@ -127,9 +127,9 @@ struct SDParams {
     float flow_shift         = INFINITY;
 
     preview_t preview_method = PREVIEW_NONE;
-    int preview_interval        = 1;
-    std::string preview_path    = "preview.png";
-    bool taesd_preview          = false;
+    int preview_interval     = 1;
+    std::string preview_path = "preview.png";
+    bool taesd_preview       = false;
 
     SDParams() {
         sd_sample_params_init(&sample_params);
@@ -298,7 +298,7 @@ void print_usage(int argc, const char* argv[]) {
     printf("  --preview {%s,%s,%s,%s}            preview method. (default is %s(disabled))\n", previews_str[0], previews_str[1], previews_str[2], previews_str[3], previews_str[PREVIEW_NONE]);
     printf("                                     %s is the fastest\n", previews_str[PREVIEW_PROJ]);
     printf("  --preview-interval [N]             How often to save the image preview");
-    printf("  --preview-path [PATH}              path to write preview image to (default: ./preview.png)\n");
+    printf("  --preview-path [PATH]              path to write preview image to (default: ./preview.png)\n");
     printf("  --color                            colors the logging tags according to level\n");
     printf("  --chroma-disable-dit-mask          disable dit mask for chroma\n");
     printf("  --chroma-enable-t5-mask            enable t5 mask for chroma\n");
@@ -506,7 +506,6 @@ void parse_args(int argc, const char** argv, SDParams& params) {
         {"-p", "--prompt", "", &params.prompt},
         {"-n", "--negative-prompt", "", &params.negative_prompt},
         {"", "--preview-path", "", &params.preview_path},
-
         {"", "--upscale-model", "", &params.esrgan_path},
     };
 
@@ -762,7 +761,7 @@ void parse_args(int argc, const char** argv, SDParams& params) {
         }
         if (preview_method == -1) {
             fprintf(stderr, "error: preview method %s\n",
-                preview);
+                    preview);
             return -1;
         }
         params.preview_method = (preview_t)preview_method;
@@ -1065,15 +1064,37 @@ uint8_t* load_image(const char* image_path, int& width, int& height, int expecte
 }
 
 const char* preview_path;
+float preview_fps;
 
-void step_callback(int step, sd_image_t image) {
-    stbi_write_png(preview_path, image.width, image.height, image.channel, image.data, 0);
+void step_callback(int step, int frame_count, sd_image_t* image) {
+    if (frame_count == 1) {
+        stbi_write_png(preview_path, image->width, image->height, image->channel, image->data, 0);
+    } else {
+        create_mjpg_avi_from_sd_images(preview_path, image, frame_count, preview_fps);
+    }
 }
 
 int main(int argc, const char* argv[]) {
     SDParams params;
     parse_args(argc, argv, params);
     preview_path = params.preview_path.c_str();
+    if (params.video_frames > 4) {
+        // Multi-frame previews are written by step_callback as an MJPG AVI, so rename "*.png" to "*.avi".
+        const size_t last_dot_pos = params.preview_path.find_last_of(".");
+        if (last_dot_pos != std::string::npos) {  // filename has extension
+            std::string file_ext = params.preview_path.substr(last_dot_pos);
+            std::transform(file_ext.begin(), file_ext.end(), file_ext.begin(), ::tolower);
+            if (file_ext == ".png") {
+                // Keep the new name in params: c_str() of a temporary std::string would dangle.
+                params.preview_path = params.preview_path.substr(0, last_dot_pos) + ".avi";
+                preview_path        = params.preview_path.c_str();
+            }
+        }
+    }
+    preview_fps = params.fps;
+    if (params.preview_method == PREVIEW_PROJ)
+        preview_fps /= 4.0f;
+
     params.sample_params.guidance.slg.layers                 = params.skip_layers.data();
     params.sample_params.guidance.slg.layer_count            = params.skip_layers.size();
     params.high_noise_sample_params.guidance.slg.layers      = params.high_noise_skip_layers.data();
diff --git a/latent-preview.h b/latent-preview.h
index c9994a2bd..97be36e0a 100644
--- a/latent-preview.h
+++ b/latent-preview.h
@@ -123,36 +123,38 @@ const float sd_latent_rgb_proj[4][3]{
     {-0.2120f, -0.2616f, -0.7177f}};
 float sd_latent_rgb_bias[3] = {0,0,0};
 
-void preview_latent_image(uint8_t* buffer, struct ggml_tensor* latents, const float (*latent_rgb_proj)[3], const float latent_rgb_bias[3], int width, int height, int dim) {
+void preview_latent_video(uint8_t* buffer, struct ggml_tensor* latents, const float (*latent_rgb_proj)[3], const float latent_rgb_bias[3], int width, int height, int frames, int dim) {
     size_t buffer_head = 0;
-    for (int j = 0; j < height; j++) {
-        for (int i = 0; i < width; i++) {
-            size_t latent_id = (i * latents->nb[0] + j * latents->nb[1]);
-            float r = 0, g = 0, b = 0;
-            for (int d = 0; d < dim; d++) {
-                float value = *(float*)((char*)latents->data + latent_id + d * latents->nb[2]);
-                r += value * latent_rgb_proj[d][0];
-                g += value * latent_rgb_proj[d][1];
-                b += value * latent_rgb_proj[d][2];
-            }
+    for (int k = 0; k < frames; k++) {
+        for (int j = 0; j < height; j++) {
+            for (int i = 0; i < width; i++) {
+                size_t latent_id = (i * latents->nb[0] + j * latents->nb[1] + k * latents->nb[2]);
+                float r = 0, g = 0, b = 0;
+                for (int d = 0; d < dim; d++) {
+                    float value = *(float*)((char*)latents->data + latent_id + d * latents->nb[ggml_n_dims(latents) - 1]);
+                    r += value * latent_rgb_proj[d][0];
+                    g += value * latent_rgb_proj[d][1];
+                    b += value * latent_rgb_proj[d][2];
+                }
                 // bias
                 r += latent_rgb_bias[0];
                 g += latent_rgb_bias[1];
                 b += latent_rgb_bias[2];
 
-            // change range
-            r = r * .5f + .5f;
-            g = g * .5f + .5f;
-            b = b * .5f + .5f;
+                // change range
+                r = r * .5f + .5f;
+                g = g * .5f + .5f;
+                b = b * .5f + .5f;
 
-            // clamp rgb values to [0,1] range
-            r = r >= 0 ? r <= 1 ? r : 1 : 0;
-            g = g >= 0 ? g <= 1 ? g : 1 : 0;
-            b = b >= 0 ? b <= 1 ? b : 1 : 0;
+                // clamp rgb values to [0,1] range
+                r = r >= 0 ? r <= 1 ? r : 1 : 0;
+                g = g >= 0 ? g <= 1 ? g : 1 : 0;
+                b = b >= 0 ? b <= 1 ? b : 1 : 0;
 
-            buffer[buffer_head++] = (uint8_t)(r * 255);
-            buffer[buffer_head++] = (uint8_t)(g * 255);
-            buffer[buffer_head++] = (uint8_t)(b * 255);
+                buffer[buffer_head++] = (uint8_t)(r * 255);
+                buffer[buffer_head++] = (uint8_t)(g * 255);
+                buffer[buffer_head++] = (uint8_t)(b * 255);
+            }
         }
     }
 }
diff --git a/stable-diffusion.cpp b/stable-diffusion.cpp
index 3d9b5a653..b69ddea87 100644
--- a/stable-diffusion.cpp
+++ b/stable-diffusion.cpp
@@ -361,8 +361,8 @@ class StableDiffusionGGML {
                                                                      offload_params_to_cpu,
                                                                      model_loader.tensor_storages_types);
                 diffusion_model  = std::make_shared<MMDiTModel>(backend,
-                                                               offload_params_to_cpu,
-                                                               model_loader.tensor_storages_types);
+                                                                offload_params_to_cpu,
+                                                                model_loader.tensor_storages_types);
             } else if (sd_version_is_flux(version)) {
                 bool is_chroma = false;
                 for (auto pair : model_loader.tensor_storages_types) {
@@ -398,11 +398,11 @@ class StableDiffusionGGML {
                                                                     1,
                                                                     true);
                 diffusion_model  = std::make_shared<WanModel>(backend,
-                                                             offload_params_to_cpu,
-                                                             model_loader.tensor_storages_types,
-                                                             "model.diffusion_model",
-                                                             version,
-                                                             sd_ctx_params->diffusion_flash_attn);
+                                                              offload_params_to_cpu,
+                                                              model_loader.tensor_storages_types,
+                                                              "model.diffusion_model",
+                                                              version,
+                                                              sd_ctx_params->diffusion_flash_attn);
                 if (strlen(SAFE_STR(sd_ctx_params->high_noise_diffusion_model_path)) > 0) {
                     high_noise_diffusion_model = std::make_shared<WanModel>(backend,
                                                                             offload_params_to_cpu,
@@ -1036,16 +1036,25 @@ class StableDiffusionGGML {
                        enum SDVersion version,
                        preview_t preview_mode,
                        ggml_tensor* result,
-                       std::function<void(int, sd_image_t)> step_callback) {
+                       std::function<void(int, int, sd_image_t*)> step_callback) {
         const uint32_t channel = 3;
         uint32_t width         = latents->ne[0];
         uint32_t height        = latents->ne[1];
-        uint32_t dim           = latents->ne[2];
+        uint32_t dim           = latents->ne[ggml_n_dims(latents) - 1];
+
         if (preview_mode == PREVIEW_PROJ) {
             const float (*latent_rgb_proj)[channel];
             float *latent_rgb_bias;
 
-            if (dim == 16) {
+            if (dim == 48) {
+                if (sd_version_is_wan(version)) {
+                    latent_rgb_proj = wan_22_latent_rgb_proj;
+                } else {
+                    LOG_WARN("No latent to RGB projection known for this model");
+                    // unknown model
+                    return;
+                }
+            } else if (dim == 16) {
                 // 16 channels VAE -> Flux or SD3
 
                 if (sd_version_is_sd3(version)) {
@@ -1053,6 +1062,8 @@ class StableDiffusionGGML {
                     latent_rgb_bias = sd3_latent_rgb_bias;
                 } else if (sd_version_is_flux(version)) {
                     latent_rgb_proj = flux_latent_rgb_proj;
+                } else if (sd_version_is_wan(version)) {
+                    latent_rgb_proj = wan_21_latent_rgb_proj;
                     latent_rgb_bias = flux_latent_rgb_bias;
                 } else {
                     LOG_WARN("No latent to RGB projection known for this model");
@@ -1078,16 +1089,22 @@ class StableDiffusionGGML {
                 // unknown latent space
                 return;
             }
-            uint8_t* data = (uint8_t*)malloc(width * height * channel * sizeof(uint8_t));
 
-            preview_latent_image(data, latents, latent_rgb_proj,latent_rgb_bias, width, height, dim);
-            sd_image_t image = {
-                width,
-                height,
-                channel,
-                data};
-            step_callback(step, image);
-            free(image.data);
+            uint32_t frames = 1;
+            if (ggml_n_dims(latents) == 4) {
+                frames = latents->ne[2];
+            }
+
+            uint8_t* data = (uint8_t*)malloc(frames * width * height * channel * sizeof(uint8_t));
+
+            preview_latent_video(data, latents, latent_rgb_proj,latent_rgb_bias, width, height, frames, dim);
+            sd_image_t* images = (sd_image_t*)malloc(frames * sizeof(sd_image_t));
+            for (int i = 0; i < frames; i++) {
+                images[i] = {width, height, channel, data + i * width * height * channel};
+            }
+            step_callback(step, frames, images);
+            free(data);
+            free(images);
         } else {
             if (preview_mode == PREVIEW_VAE) {
                 process_latent_out(latents);
@@ -1101,8 +1118,10 @@ class StableDiffusionGGML {
                 } else {
                     first_stage_model->compute(n_threads, latents, true, &result, work_ctx);
                 }
+
                 first_stage_model->free_compute_buffer();
                 process_vae_output_tensor(result);
+                process_latent_in(latents);
             } else if (preview_mode == PREVIEW_TAE) {
                 if (tae_first_stage == nullptr) {
                     LOG_WARN("TAE not found for preview");
@@ -1121,15 +1140,30 @@ class StableDiffusionGGML {
             } else {
                 return;
             }
+
             ggml_tensor_clamp(result, 0.0f, 1.0f);
-            sd_image_t image = {
-                width * 8,
-                height * 8,
-                channel,
-                sd_tensor_to_image(result)};
+            uint32_t frames = 1;
+            if (ggml_n_dims(latents) == 4) {
+                frames = result->ne[2];
+            }
+
+            sd_image_t* images = (sd_image_t*)malloc(frames * sizeof(sd_image_t));
+            print_ggml_tensor(result,true);
+            for (size_t i = 0; i < frames; i++) {
+                images[i].width   = result->ne[0];
+                images[i].height  = result->ne[1];
+                images[i].channel = 3;
+                images[i].data    = sd_tensor_to_image(result, i, ggml_n_dims(latents) == 4);
+            }
+
+            step_callback(step, frames, images);
+            
             ggml_tensor_scale(result, 0);
-            step_callback(step, image);
-            free(image.data);
+            for (int i = 0; i < frames; i++) {
+                free(images[i].data);
+            }
+
+            free(images);
         }
     }
 
@@ -1200,13 +1234,32 @@ class StableDiffusionGGML {
         struct ggml_tensor* denoised = ggml_dup_tensor(work_ctx, x);
 
         struct ggml_tensor* preview_tensor = NULL;
-        auto sd_preview_mode = sd_get_preview_mode();
+        auto sd_preview_mode               = sd_get_preview_mode();
         if (sd_preview_mode != PREVIEW_NONE && sd_preview_mode != PREVIEW_PROJ) {
-            preview_tensor = ggml_new_tensor_4d(work_ctx, GGML_TYPE_F32,
-                                                (denoised->ne[0] * 8),
-                                                (denoised->ne[1] * 8),
-                                                3,
-                                                denoised->ne[3]);
+            int64_t W = x->ne[0] * 8;
+            int64_t H = x->ne[1] * 8;
+            if (ggml_n_dims(x) == 4) {
+                // assuming video mode (if batch processing gets implemented this will break)
+                int T = x->ne[2];
+                if (sd_version_is_wan(version)) {
+                    T = ((T - 1) * 4) + 1;
+                    if (version == VERSION_WAN2_2_TI2V) {
+                        W = x->ne[0] * 16;
+                        H = x->ne[1] * 16;
+                    }
+                }
+                preview_tensor = ggml_new_tensor_4d(work_ctx, GGML_TYPE_F32,
+                                                    W,
+                                                    H,
+                                                    T,
+                                                    3);
+            } else {
+                preview_tensor = ggml_new_tensor_4d(work_ctx, GGML_TYPE_F32,
+                                                    W,
+                                                    H,
+                                                    3,
+                                                    x->ne[3]);
+            }
         }
 
         auto denoise = [&](ggml_tensor* input, float sigma, int step) -> ggml_tensor* {
@@ -1378,7 +1431,7 @@ class StableDiffusionGGML {
                 pretty_progress(step, (int)steps, (t1 - t0) / 1000000.f);
                 // LOG_INFO("step %d sampling completed taking %.2fs", step, (t1 - t0) * 1.0f / 1000000);
             }
-            auto sd_preview_cb = sd_get_preview_callback();
+            auto sd_preview_cb   = sd_get_preview_callback();
             auto sd_preview_mode = sd_get_preview_mode();
             if (sd_preview_cb != NULL) {
                 if (step % sd_get_preview_interval() == 0) {
@@ -1465,12 +1518,12 @@ class StableDiffusionGGML {
                                     -0.0313f, -0.1649f, 0.0117f, 0.0723f, -0.2839f, -0.2083f, -0.0520f, 0.3748f,
                                     0.0152f, 0.1957f, 0.1433f, -0.2944f, 0.3573f, -0.0548f, -0.1681f, -0.0667f};
                 latents_std_vec  = {
-                     0.4765f, 1.0364f, 0.4514f, 1.1677f, 0.5313f, 0.4990f, 0.4818f, 0.5013f,
-                     0.8158f, 1.0344f, 0.5894f, 1.0901f, 0.6885f, 0.6165f, 0.8454f, 0.4978f,
-                     0.5759f, 0.3523f, 0.7135f, 0.6804f, 0.5833f, 1.4146f, 0.8986f, 0.5659f,
-                     0.7069f, 0.5338f, 0.4889f, 0.4917f, 0.4069f, 0.4999f, 0.6866f, 0.4093f,
-                     0.5709f, 0.6065f, 0.6415f, 0.4944f, 0.5726f, 1.2042f, 0.5458f, 1.6887f,
-                     0.3971f, 1.0600f, 0.3943f, 0.5537f, 0.5444f, 0.4089f, 0.7468f, 0.7744f};
+                    0.4765f, 1.0364f, 0.4514f, 1.1677f, 0.5313f, 0.4990f, 0.4818f, 0.5013f,
+                    0.8158f, 1.0344f, 0.5894f, 1.0901f, 0.6885f, 0.6165f, 0.8454f, 0.4978f,
+                    0.5759f, 0.3523f, 0.7135f, 0.6804f, 0.5833f, 1.4146f, 0.8986f, 0.5659f,
+                    0.7069f, 0.5338f, 0.4889f, 0.4917f, 0.4069f, 0.4999f, 0.6866f, 0.4093f,
+                    0.5709f, 0.6065f, 0.6415f, 0.4944f, 0.5726f, 1.2042f, 0.5458f, 1.6887f,
+                    0.3971f, 1.0600f, 0.3943f, 0.5537f, 0.5444f, 0.4089f, 0.7468f, 0.7744f};
             }
             for (int i = 0; i < latent->ne[3]; i++) {
                 float mean = latents_mean_vec[i];
@@ -1505,12 +1558,12 @@ class StableDiffusionGGML {
                                     -0.0313f, -0.1649f, 0.0117f, 0.0723f, -0.2839f, -0.2083f, -0.0520f, 0.3748f,
                                     0.0152f, 0.1957f, 0.1433f, -0.2944f, 0.3573f, -0.0548f, -0.1681f, -0.0667f};
                 latents_std_vec  = {
-                     0.4765f, 1.0364f, 0.4514f, 1.1677f, 0.5313f, 0.4990f, 0.4818f, 0.5013f,
-                     0.8158f, 1.0344f, 0.5894f, 1.0901f, 0.6885f, 0.6165f, 0.8454f, 0.4978f,
-                     0.5759f, 0.3523f, 0.7135f, 0.6804f, 0.5833f, 1.4146f, 0.8986f, 0.5659f,
-                     0.7069f, 0.5338f, 0.4889f, 0.4917f, 0.4069f, 0.4999f, 0.6866f, 0.4093f,
-                     0.5709f, 0.6065f, 0.6415f, 0.4944f, 0.5726f, 1.2042f, 0.5458f, 1.6887f,
-                     0.3971f, 1.0600f, 0.3943f, 0.5537f, 0.5444f, 0.4089f, 0.7468f, 0.7744f};
+                    0.4765f, 1.0364f, 0.4514f, 1.1677f, 0.5313f, 0.4990f, 0.4818f, 0.5013f,
+                    0.8158f, 1.0344f, 0.5894f, 1.0901f, 0.6885f, 0.6165f, 0.8454f, 0.4978f,
+                    0.5759f, 0.3523f, 0.7135f, 0.6804f, 0.5833f, 1.4146f, 0.8986f, 0.5659f,
+                    0.7069f, 0.5338f, 0.4889f, 0.4917f, 0.4069f, 0.4999f, 0.6866f, 0.4093f,
+                    0.5709f, 0.6065f, 0.6415f, 0.4944f, 0.5726f, 1.2042f, 0.5458f, 1.6887f,
+                    0.3971f, 1.0600f, 0.3943f, 0.5537f, 0.5444f, 0.4089f, 0.7468f, 0.7744f};
             }
             for (int i = 0; i < latent->ne[3]; i++) {
                 float mean = latents_mean_vec[i];
@@ -1967,8 +2020,8 @@ sd_image_t* generate_image_internal(sd_ctx_t* sd_ctx,
                                     std::string input_id_images_path,
                                     std::vector<ggml_tensor*> ref_latents,
                                     bool increase_ref_index,
-                                    ggml_tensor* concat_latent                                      = NULL,
-                                    ggml_tensor* denoise_mask                                       = NULL) {
+                                    ggml_tensor* concat_latent = NULL,
+                                    ggml_tensor* denoise_mask  = NULL) {
     if (seed < 0) {
         // Generally, when using the provided command line, the seed is always >0.
         // However, to prevent potential issues if 'stable-diffusion.cpp' is invoked as a library
diff --git a/stable-diffusion.h b/stable-diffusion.h
index bd39fc018..779590b60 100644
--- a/stable-diffusion.h
+++ b/stable-diffusion.h
@@ -226,7 +226,7 @@ typedef struct sd_ctx_t sd_ctx_t;
 
 typedef void (*sd_log_cb_t)(enum sd_log_level_t level, const char* text, void* data);
 typedef void (*sd_progress_cb_t)(int step, int steps, float time, void* data);
-typedef void (*sd_preview_cb_t)(int, sd_image_t);
+typedef void (*sd_preview_cb_t)(int, int, sd_image_t*);
 
 SD_API void sd_set_log_callback(sd_log_cb_t sd_log_cb, void* data);
 SD_API void sd_set_progress_callback(sd_progress_cb_t cb, void* data);

From 7a0ab287db32d897e7941673bf444114204ab9fd Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?St=C3=A9phane=20du=20Hamel?= <stephduh@live.fr>
Date: Sun, 31 Aug 2025 02:08:30 +0200
Subject: [PATCH 25/45] latent proj bias: add wan 2.1/2.2 RGB bias and use it in preview

---
 latent-preview.h     | 2 ++
 stable-diffusion.cpp | 5 ++++-
 2 files changed, 6 insertions(+), 1 deletion(-)

diff --git a/latent-preview.h b/latent-preview.h
index 97be36e0a..5c1606053 100644
--- a/latent-preview.h
+++ b/latent-preview.h
@@ -15,6 +15,7 @@ const float wan_21_latent_rgb_proj[16][3] = {
     {0.0060, -0.0633, 0.0005},
     {0.3477, 0.2275, 0.2950},
     {0.1984, 0.0913, 0.1861}};
+float wan_21_latent_rgb_bias[3] = {-0.1223, -0.1889, -0.1976};
 
 const float wan_22_latent_rgb_proj[48][3] = {
     {0.0119, 0.0103, 0.0046},
@@ -65,6 +66,7 @@ const float wan_22_latent_rgb_proj[48][3] = {
     {0.0421, 0.0451, 0.0373},
     {0.0504, -0.0483, -0.0356},
     {-0.0837, 0.0168, 0.0055}};
+float wan_22_latent_rgb_bias[3] = {0.0317, -0.0878, -0.1388};
 
 // https://github.com/comfyanonymous/ComfyUI/blob/master/comfy/latent_formats.py#L152-L169
 const float flux_latent_rgb_proj[16][3] = {
diff --git a/stable-diffusion.cpp b/stable-diffusion.cpp
index b69ddea87..42744cccc 100644
--- a/stable-diffusion.cpp
+++ b/stable-diffusion.cpp
@@ -1045,10 +1045,12 @@ class StableDiffusionGGML {
         if (preview_mode == PREVIEW_PROJ) {
             const float (*latent_rgb_proj)[channel];
             float *latent_rgb_bias;
+            float *latent_rgb_bias;
 
             if (dim == 48) {
                 if (sd_version_is_wan(version)) {
                     latent_rgb_proj = wan_22_latent_rgb_proj;
+                    latent_rgb_bias = wan_22_latent_rgb_bias;
                 } else {
                     LOG_WARN("No latent to RGB projection known for this model");
                     // unknown model
@@ -1062,9 +1064,10 @@ class StableDiffusionGGML {
                     latent_rgb_bias = sd3_latent_rgb_bias;
                 } else if (sd_version_is_flux(version)) {
                     latent_rgb_proj = flux_latent_rgb_proj;
+                    latent_rgb_bias = flux_latent_rgb_bias;
                 } else if (sd_version_is_wan(version)) {
                     latent_rgb_proj = wan_21_latent_rgb_proj;
-                    latent_rgb_bias = flux_latent_rgb_bias;
+                    latent_rgb_bias = wan_21_latent_rgb_bias;
                 } else {
                     LOG_WARN("No latent to RGB projection known for this model");
                     // unknown model

From 3e0ef2796485378145c9846e666bcd8bfab77eda Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?St=C3=A9phane=20du=20Hamel?= <stephduh@live.fr>
Date: Tue, 9 Sep 2025 13:20:41 +0200
Subject: [PATCH 26/45] fix duplicate latent_rgb_bias declaration

---
 stable-diffusion.cpp | 1 -
 1 file changed, 1 deletion(-)

diff --git a/stable-diffusion.cpp b/stable-diffusion.cpp
index 42744cccc..716269930 100644
--- a/stable-diffusion.cpp
+++ b/stable-diffusion.cpp
@@ -1045,7 +1045,6 @@ class StableDiffusionGGML {
         if (preview_mode == PREVIEW_PROJ) {
             const float (*latent_rgb_proj)[channel];
             float *latent_rgb_bias;
-            float *latent_rgb_bias;
 
             if (dim == 48) {
                 if (sd_version_is_wan(version)) {

From 70a16116d80fc44b5506306d37a2388be3a82820 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?St=C3=A9phane=20du=20Hamel?= <stephduh@live.fr>
Date: Wed, 15 Oct 2025 10:14:04 +0200
Subject: [PATCH 27/45] Support latent2rgb preview for qwen image (via wan21)

---
 stable-diffusion.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/stable-diffusion.cpp b/stable-diffusion.cpp
index 4646c2cf4..5fee56c2c 100644
--- a/stable-diffusion.cpp
+++ b/stable-diffusion.cpp
@@ -1136,7 +1136,7 @@ class StableDiffusionGGML {
                 } else if (sd_version_is_flux(version)) {
                     latent_rgb_proj = flux_latent_rgb_proj;
                     latent_rgb_bias = flux_latent_rgb_bias;
-                } else if (sd_version_is_wan(version)) {
+                } else if (sd_version_is_wan(version) || sd_version_is_qwen_image(version)) {
                     latent_rgb_proj = wan_21_latent_rgb_proj;
                     latent_rgb_bias = wan_21_latent_rgb_bias;
                 } else {

From e2ce17dd618e96333c0d13cae136f442bddf7a94 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?St=C3=A9phane=20du=20Hamel?= <stephduh@live.fr>
Date: Wed, 15 Oct 2025 14:40:17 +0200
Subject: [PATCH 28/45] Fix ctx memory pool size overwritten during merge

---
 stable-diffusion.cpp | 18 +-----------------
 1 file changed, 1 insertion(+), 17 deletions(-)

diff --git a/stable-diffusion.cpp b/stable-diffusion.cpp
index 5fee56c2c..a0349dda8 100644
--- a/stable-diffusion.cpp
+++ b/stable-diffusion.cpp
@@ -2638,23 +2638,7 @@ sd_image_t* generate_image(sd_ctx_t* sd_ctx, const sd_img_gen_params_t* sd_img_g
     }
 
     struct ggml_init_params params;
-    params.mem_size = static_cast<size_t>(10 * 1024 * 1024);  // 10 MB
-    if (sd_version_is_sd3(sd_ctx->sd->version)) {
-        params.mem_size *= 3;
-    }
-    if (sd_version_is_flux(sd_ctx->sd->version)) {
-        params.mem_size *= 4;
-    }
-    if (sd_ctx->sd->stacked_id) {
-        params.mem_size += static_cast<size_t>(10 * 1024 * 1024);  // 10 MB
-    }
-    auto sd_preview_mode = sd_get_preview_mode();
-    if (sd_preview_mode != PREVIEW_NONE && sd_preview_mode != PREVIEW_PROJ) {
-        params.mem_size *= 2;
-    }
-    params.mem_size += width * height * 3 * sizeof(float) * 3;
-    params.mem_size += width * height * 3 * sizeof(float) * 3 * sd_img_gen_params->ref_images_count;
-    params.mem_size *= sd_img_gen_params->batch_count;
+    params.mem_size   = static_cast<size_t>(1024 * 1024) * 1024;  // 1G
     params.mem_buffer = NULL;
     params.no_alloc   = false;
     // LOG_DEBUG("mem_size %u ", params.mem_size);

From f7b53e50456ad8b7ac5eda3072833008b0a77d79 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?St=C3=A9phane=20du=20Hamel?= <stephduh@live.fr>
Date: Sun, 19 Oct 2025 17:20:45 +0200
Subject: [PATCH 29/45] fix build and update help messages

---
 examples/cli/main.cpp | 9 ++++++---
 1 file changed, 6 insertions(+), 3 deletions(-)

diff --git a/examples/cli/main.cpp b/examples/cli/main.cpp
index 1c239e28d..2aeed946e 100644
--- a/examples/cli/main.cpp
+++ b/examples/cli/main.cpp
@@ -668,7 +668,7 @@ void parse_args(int argc, const char** argv, SDParams& params) {
          &params.sample_params.shifted_timestep},
         {"",
          "--preview-interval",
-         "How often to save the image preview",
+         "interval in denoising steps between consecutive updates of the image preview file (default is 1, meaning updating at every step)",
           &params.preview_interval},
     };
 
@@ -826,7 +826,7 @@ void parse_args(int argc, const char** argv, SDParams& params) {
          false, &params.auto_resize_ref_image},
         {"",
          "--taesd-preview-only",
-         "prevents usage of taesd for decoding the final image. (for use with --preview " + previews_str[PREVIEW_TAE] + ")", 
+         std::string("prevents usage of taesd for decoding the final image. (for use with --preview ") + previews_str[PREVIEW_TAE] + ")", 
          false, &params.taesd_preview},
     };
 
@@ -1157,7 +1157,10 @@ void parse_args(int argc, const char** argv, SDParams& params) {
          "--vae-relative-tile-size",
          "relative tile size for vae tiling, format [X]x[Y], in fraction of image size if < 1, in number of tiles per dim if >=1 (overrides --vae-tile-size)",
          on_relative_tile_size_arg},
-        {"", "--preview", "preview method. must be one of " previews_str[0] + ", " + previews_str[1] + ", " + previews_str[2] + ", " + previews_str[3] + "(default is " + previews_str[PREVIEW_NONE] + "(disabled))\n", on_preview_arg},
+        {"",
+         "--preview",
+         std::string("preview method. must be one of the following [") + previews_str[0] + ", " + previews_str[1] + ", " + previews_str[2] + ", " + previews_str[3] + "] (default is " + previews_str[PREVIEW_NONE] + ")\n",
+         on_preview_arg},
     };
 
     if (!parse_options(argc, argv, options)) {

From 0a59f36b8c4e5af8d0683390736778f78996a463 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?St=C3=A9phane=20du=20Hamel?= <stephduh@live.fr>
Date: Sun, 19 Oct 2025 17:23:07 +0200
Subject: [PATCH 30/45] update help message in readme

---
 examples/cli/README.md | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/examples/cli/README.md b/examples/cli/README.md
index ee17d17da..abbb0b6f8 100644
--- a/examples/cli/README.md
+++ b/examples/cli/README.md
@@ -32,6 +32,7 @@ Options:
   -o, --output <string>                    path to write result image to (default: ./output.png)
   -p, --prompt <string>                    the prompt to render
   -n, --negative-prompt <string>           the negative prompt (default: "")
+  --preview-path <string>                  path to write preview image to (default: ./preview.png)
   --upscale-model <string>                 path to esrgan model.
   -t, --threads <int>                      number of threads to use during computation (default: -1). If threads <= 0, then threads will be set to the number of
                                            CPU physical cores
@@ -48,6 +49,8 @@ Options:
   --fps <int>                              fps (default: 24)
   --timestep-shift <int>                   shift timestep for NitroFusion models (default: 0). recommended N for NitroSD-Realism around 250 and 500 for
                                            NitroSD-Vibrant
+  --preview-interval <int>                 interval in denoising steps between consecutive updates of the image preview file (default is 1, meaning updating at
+                                           every step)
   --cfg-scale <float>                      unconditional guidance scale: (default: 7.0)
   --img-cfg-scale <float>                  image guidance scale for inpaint or instruct-pix2pix models: (default: same as --cfg-scale)
   --guidance <float>                       distilled guidance scale for models with guidance input (default: 3.5)
@@ -86,6 +89,7 @@ Options:
   --chroma-enable-t5-mask                  enable t5 mask for chroma
   --increase-ref-index                     automatically increase the indices of references images based on the order they are listed (starting with 1).
   --disable-auto-resize-ref-image          disable auto resize of ref images
+  --taesd-preview-only                     prevents usage of taesd for decoding the final image. (for use with --preview tae)
   -M, --mode                               run mode, one of [img_gen, vid_gen, upscale, convert], default: img_gen
   --type                                   weight type (examples: f32, f16, q4_0, q4_1, q5_0, q5_1, q8_0, q2_K, q3_K, q4_K). If not specified, the default is the
                                            type of the weight file
@@ -107,4 +111,5 @@ Options:
   --vae-tile-size                          tile size for vae tiling, format [X]x[Y] (default: 32x32)
   --vae-relative-tile-size                 relative tile size for vae tiling, format [X]x[Y], in fraction of image size if < 1, in number of tiles per dim if >=1
                                            (overrides --vae-tile-size)
+  --preview                                preview method. must be one of the following [none, proj, tae, vae] (default is none)
 ```
\ No newline at end of file

From 059f025c5763ba2c47209cabe07c58056425197e Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?St=C3=A9phane=20du=20Hamel?= <stephduh@live.fr>
Date: Thu, 23 Oct 2025 01:46:06 +0200
Subject: [PATCH 31/45] remove tensor shape spam

---
 stable-diffusion.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/stable-diffusion.cpp b/stable-diffusion.cpp
index ba2f0d7fe..a30941377 100644
--- a/stable-diffusion.cpp
+++ b/stable-diffusion.cpp
@@ -1251,7 +1251,7 @@ class StableDiffusionGGML {
             }
 
             sd_image_t* images = (sd_image_t*)malloc(frames * sizeof(sd_image_t));
-            print_ggml_tensor(result,true);
+            // print_ggml_tensor(result,true);
             for (size_t i = 0; i < frames; i++) {
                 images[i].width   = result->ne[0];
                 images[i].height  = result->ne[1];

From 6563d46cf1eca74c28d47aaf381d7fe46860a35a Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?St=C3=A9phane=20du=20Hamel?= <stephduh@live.fr>
Date: Thu, 23 Oct 2025 01:51:15 +0200
Subject: [PATCH 32/45] Fix progress display (remove extra pretty_progress call)

---
 stable-diffusion.cpp | 4 ----
 1 file changed, 4 deletions(-)

diff --git a/stable-diffusion.cpp b/stable-diffusion.cpp
index a30941377..5fb32d0f9 100644
--- a/stable-diffusion.cpp
+++ b/stable-diffusion.cpp
@@ -1539,10 +1539,6 @@ class StableDiffusionGGML {
             if (denoise_mask != nullptr) {
                 apply_mask(denoised, init_latent, denoise_mask);
             }
-            if (step > 0) {
-                pretty_progress(step, (int)steps, (t1 - t0) / 1000000.f);
-                // LOG_INFO("step %d sampling completed taking %.2fs", step, (t1 - t0) * 1.0f / 1000000);
-            }
             auto sd_preview_cb   = sd_get_preview_callback();
             auto sd_preview_mode = sd_get_preview_mode();
             if (sd_preview_cb != NULL) {

From b1fc7cdec0c83e2fca4ae46f54fa9ad85d995fce Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?St=C3=A9phane=20du=20Hamel?= <stephduh@live.fr>
Date: Sat, 25 Oct 2025 19:55:22 +0200
Subject: [PATCH 33/45] preview: support pixel space diffusion

---
 latent-preview.h     | 28 ++++++++++++++++++----------
 stable-diffusion.cpp | 18 ++++++++----------
 2 files changed, 26 insertions(+), 20 deletions(-)

diff --git a/latent-preview.h b/latent-preview.h
index 5c1606053..67011837c 100644
--- a/latent-preview.h
+++ b/latent-preview.h
@@ -132,17 +132,25 @@ void preview_latent_video(uint8_t* buffer, struct ggml_tensor* latents, const fl
             for (int i = 0; i < width; i++) {
                 size_t latent_id = (i * latents->nb[0] + j * latents->nb[1] + k * latents->nb[2]);
                 float r = 0, g = 0, b = 0;
-                for (int d = 0; d < dim; d++) {
-                    float value = *(float*)((char*)latents->data + latent_id + d * latents->nb[ggml_n_dims(latents) - 1]);
-                    r += value * latent_rgb_proj[d][0];
-                    g += value * latent_rgb_proj[d][1];
-                    b += value * latent_rgb_proj[d][2];
+                if(latent_rgb_proj!=NULL){
+                    for (int d = 0; d < dim; d++) {
+                        float value = *(float*)((char*)latents->data + latent_id + d * latents->nb[ggml_n_dims(latents) - 1]);
+                        r += value * latent_rgb_proj[d][0];
+                        g += value * latent_rgb_proj[d][1];
+                        b += value * latent_rgb_proj[d][2];
+                    }
+                } else {
+                    // interpret first 3 channels as RGB
+                    r = *(float*)((char*)latents->data + latent_id + 0 * latents->nb[ggml_n_dims(latents) - 1]);
+                    g = *(float*)((char*)latents->data + latent_id + 1 * latents->nb[ggml_n_dims(latents) - 1]);
+                    b = *(float*)((char*)latents->data + latent_id + 2 * latents->nb[ggml_n_dims(latents) - 1]);
+                }
+                if(latent_rgb_bias!=NULL){
+                    // bias
+                    r += latent_rgb_bias[0];
+                    g += latent_rgb_bias[1];
+                    b += latent_rgb_bias[2];
                 }
-                // bias
-                r += latent_rgb_bias[0];
-                g += latent_rgb_bias[1];
-                b += latent_rgb_bias[2];
-
                 // change range
                 r = r * .5f + .5f;
                 g = g * .5f + .5f;
diff --git a/stable-diffusion.cpp b/stable-diffusion.cpp
index 36009bcd7..23c02f021 100644
--- a/stable-diffusion.cpp
+++ b/stable-diffusion.cpp
@@ -1155,8 +1155,8 @@ class StableDiffusionGGML {
         uint32_t dim           = latents->ne[ggml_n_dims(latents) - 1];
 
         if (preview_mode == PREVIEW_PROJ) {
-            const float (*latent_rgb_proj)[channel];
-            float *latent_rgb_bias;
+            const float (*latent_rgb_proj)[channel] = NULL;
+            float *latent_rgb_bias = NULL;
 
             if (dim == 48) {
                 if (sd_version_is_wan(version)) {
@@ -1198,6 +1198,8 @@ class StableDiffusionGGML {
                     LOG_WARN("No latent to RGB projection known for this model");
                     return;
                 }
+            } else if (dim == 4) {
+                // Do nothing, assuming already RGB latents
             } else {
                 LOG_WARN("No latent to RGB projection known for this model");
                 // unknown latent space
@@ -1227,7 +1229,7 @@ class StableDiffusionGGML {
                     auto on_tiling = [&](ggml_tensor* in, ggml_tensor* out, bool init) {
                         first_stage_model->compute(n_threads, in, true, &out, NULL);
                     };
-                    silent_tiling(latents, result, 8, 32, 0.5f, on_tiling);
+                    silent_tiling(latents, result, get_vae_scale_factor(), 32, 0.5f, on_tiling);
 
                 } else {
                     first_stage_model->compute(n_threads, latents, true, &result, work_ctx);
@@ -1246,7 +1248,7 @@ class StableDiffusionGGML {
                     auto on_tiling = [&](ggml_tensor* in, ggml_tensor* out, bool init) {
                         tae_first_stage->compute(n_threads, in, true, &out, NULL);
                     };
-                    silent_tiling(latents, result, 8, 64, 0.5f, on_tiling);
+                    silent_tiling(latents, result, get_vae_scale_factor(), 64, 0.5f, on_tiling);
                 } else {
                     tae_first_stage->compute(n_threads, latents, true, &result, work_ctx);
                 }
@@ -1359,17 +1361,13 @@ class StableDiffusionGGML {
         struct ggml_tensor* preview_tensor = NULL;
         auto sd_preview_mode               = sd_get_preview_mode();
         if (sd_preview_mode != PREVIEW_NONE && sd_preview_mode != PREVIEW_PROJ) {
-            int64_t W = x->ne[0] * 8;
-            int64_t H = x->ne[1] * 8;
+            int64_t W = x->ne[0] * get_vae_scale_factor();
+            int64_t H = x->ne[1] * get_vae_scale_factor();
             if (ggml_n_dims(x) == 4) {
                 // assuming video mode (if batch processing gets implemented this will break)
                 int T = x->ne[2];
                 if (sd_version_is_wan(version)) {
                     T = ((T - 1) * 4) + 1;
-                    if (version == VERSION_WAN2_2_TI2V) {
-                        W = x->ne[0] * 16;
-                        H = x->ne[1] * 16;
-                    }
                 }
                 preview_tensor = ggml_new_tensor_4d(work_ctx, GGML_TYPE_F32,
                                                     W,

From 31d36b2ee7ce8ac3b49229bb9228a7ea7cdf7fe0 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?St=C3=A9phane=20du=20Hamel?= <stephduh@live.fr>
Date: Sat, 25 Oct 2025 19:57:40 +0200
Subject: [PATCH 34/45] include preview (and apply_mask) in speed stats
 properly

---
 stable-diffusion.cpp | 13 +++++++------
 1 file changed, 7 insertions(+), 6 deletions(-)

diff --git a/stable-diffusion.cpp b/stable-diffusion.cpp
index 23c02f021..abb5a0e95 100644
--- a/stable-diffusion.cpp
+++ b/stable-diffusion.cpp
@@ -1539,12 +1539,6 @@ class StableDiffusionGGML {
                 vec_denoised[i] = latent_result * c_out + vec_input[i] * c_skip;
             }
 
-            int64_t t1 = ggml_time_us();
-            if (step > 0 || step == -(int)steps) {
-                int showstep = std::abs(step);
-                pretty_progress(showstep, (int)steps, (t1 - t0) / 1000000.f / showstep);
-                // LOG_INFO("step %d sampling completed taking %.2fs", step, (t1 - t0) * 1.0f / 1000000);
-            }
             if (denoise_mask != nullptr) {
                 apply_mask(denoised, init_latent, denoise_mask);
             }
@@ -1555,6 +1549,13 @@ class StableDiffusionGGML {
                     preview_image(work_ctx, step, denoised, version, sd_preview_mode, preview_tensor, sd_preview_cb);
                 }
             }
+            
+            int64_t t1 = ggml_time_us();
+            if (step > 0 || step == -(int)steps) {
+                int showstep = std::abs(step);
+                pretty_progress(showstep, (int)steps, (t1 - t0) / 1000000.f / showstep);
+                // LOG_INFO("step %d sampling completed taking %.2fs", step, (t1 - t0) * 1.0f / 1000000);
+            }
             return denoised;
         };
 

From 4e3500c99e8ef80c0b0863410e20e3440aaf7c55 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?St=C3=A9phane=20du=20Hamel?= <stephduh@live.fr>
Date: Sat, 25 Oct 2025 20:17:59 +0200
Subject: [PATCH 35/45] support noisy preview via API

---
 examples/cli/main.cpp |  7 +++++--
 stable-diffusion.cpp  | 23 +++++++++++++++--------
 stable-diffusion.h    |  4 ++--
 util.cpp              | 14 ++++++++++++--
 util.h                |  2 ++
 5 files changed, 36 insertions(+), 14 deletions(-)

diff --git a/examples/cli/main.cpp b/examples/cli/main.cpp
index 2aeed946e..c693aaee1 100644
--- a/examples/cli/main.cpp
+++ b/examples/cli/main.cpp
@@ -1506,7 +1506,10 @@ bool load_images_from_dir(const std::string dir,
 const char* preview_path;
 float preview_fps;
 
-void step_callback(int step, int frame_count, sd_image_t* image) {
+void step_callback(int step, int frame_count, sd_image_t* image, bool is_noisy) {
+    (void)is_noisy;
+    // is_noisy is set to true if the preview corresponds to noisy latents, false if it's denoised latents
+    // unused in this app, it will either be always noisy or always denoised here
     if (frame_count == 1) {
         stbi_write_png(preview_path, image->width, image->height, image->channel, image->data, 0);
     } else {
@@ -1541,7 +1544,7 @@ int main(int argc, const char* argv[]) {
     params.high_noise_sample_params.guidance.slg.layer_count = params.high_noise_skip_layers.size();
 
     sd_set_log_callback(sd_log_cb, (void*)&params);
-    sd_set_preview_callback((sd_preview_cb_t)step_callback, params.preview_method, params.preview_interval);
+    sd_set_preview_callback((sd_preview_cb_t)step_callback, params.preview_method, params.preview_interval, true, false);
 
     if (params.verbose) {
         print_params(params);
diff --git a/stable-diffusion.cpp b/stable-diffusion.cpp
index abb5a0e95..18da75c35 100644
--- a/stable-diffusion.cpp
+++ b/stable-diffusion.cpp
@@ -1148,7 +1148,8 @@ class StableDiffusionGGML {
                        enum SDVersion version,
                        preview_t preview_mode,
                        ggml_tensor* result,
-                       std::function<void(int, int, sd_image_t*)> step_callback) {
+                       std::function<void(int, int, sd_image_t*, bool)> step_callback,
+                       bool is_noisy) {
         const uint32_t channel = 3;
         uint32_t width         = latents->ne[0];
         uint32_t height        = latents->ne[1];
@@ -1218,7 +1219,7 @@ class StableDiffusionGGML {
             for (int i = 0; i < frames; i++) {
                 images[i] = {width, height, channel, data + i * width * height * channel};
             }
-            step_callback(step, frames, images);
+            step_callback(step, frames, images, is_noisy);
             free(data);
             free(images);
         } else {
@@ -1272,7 +1273,7 @@ class StableDiffusionGGML {
                 images[i].data    = sd_tensor_to_image(result, i, ggml_n_dims(latents) == 4);
             }
 
-            step_callback(step, frames, images);
+            step_callback(step, frames, images, is_noisy);
             
             ggml_tensor_scale(result, 0);
             for (int i = 0; i < frames; i++) {
@@ -1384,6 +1385,8 @@ class StableDiffusionGGML {
         }
 
         auto denoise = [&](ggml_tensor* input, float sigma, int step) -> ggml_tensor* {
+            auto sd_preview_cb   = sd_get_preview_callback();
+            auto sd_preview_mode = sd_get_preview_mode();
             if (step == 1 || step == -1) {
                 pretty_progress(0, (int)steps, 0);
             }
@@ -1418,6 +1421,11 @@ class StableDiffusionGGML {
             if (denoise_mask != nullptr && version == VERSION_WAN2_2_TI2V) {
                 apply_mask(noised_input, init_latent, denoise_mask);
             }
+            if (sd_preview_cb != NULL && sd_should_preview_noisy()) {
+                if (step % sd_get_preview_interval() == 0) {
+                    preview_image(work_ctx, step, noised_input, version, sd_preview_mode, preview_tensor, sd_preview_cb, true);
+                }
+            }
 
             std::vector<struct ggml_tensor*> controls;
 
@@ -1542,14 +1550,13 @@ class StableDiffusionGGML {
             if (denoise_mask != nullptr) {
                 apply_mask(denoised, init_latent, denoise_mask);
             }
-            auto sd_preview_cb   = sd_get_preview_callback();
-            auto sd_preview_mode = sd_get_preview_mode();
-            if (sd_preview_cb != NULL) {
+
+            if (sd_preview_cb != NULL && sd_should_preview_denoised()) {
                 if (step % sd_get_preview_interval() == 0) {
-                    preview_image(work_ctx, step, denoised, version, sd_preview_mode, preview_tensor, sd_preview_cb);
+                    preview_image(work_ctx, step, denoised, version, sd_preview_mode, preview_tensor, sd_preview_cb, false);
                 }
             }
-            
+
             int64_t t1 = ggml_time_us();
             if (step > 0 || step == -(int)steps) {
                 int showstep = std::abs(step);
diff --git a/stable-diffusion.h b/stable-diffusion.h
index e82a7fd81..9e99d53de 100644
--- a/stable-diffusion.h
+++ b/stable-diffusion.h
@@ -263,11 +263,11 @@ typedef struct sd_ctx_t sd_ctx_t;
 
 typedef void (*sd_log_cb_t)(enum sd_log_level_t level, const char* text, void* data);
 typedef void (*sd_progress_cb_t)(int step, int steps, float time, void* data);
-typedef void (*sd_preview_cb_t)(int, int, sd_image_t*);
+typedef void (*sd_preview_cb_t)(int step, int frame_count, sd_image_t* frames, bool is_noisy);
 
 SD_API void sd_set_log_callback(sd_log_cb_t sd_log_cb, void* data);
 SD_API void sd_set_progress_callback(sd_progress_cb_t cb, void* data);
-SD_API void sd_set_preview_callback(sd_preview_cb_t cb, preview_t mode, int interval);
+SD_API void sd_set_preview_callback(sd_preview_cb_t cb, preview_t mode, int interval, bool denoised, bool noisy);
 SD_API int32_t get_num_physical_cores();
 SD_API const char* sd_get_system_info();
 
diff --git a/util.cpp b/util.cpp
index 23b6c3b2a..0fa4bbcd7 100644
--- a/util.cpp
+++ b/util.cpp
@@ -189,8 +189,10 @@ static sd_progress_cb_t sd_progress_cb = nullptr;
 void* sd_progress_cb_data              = nullptr;
 
 static sd_preview_cb_t sd_preview_cb = NULL;
-preview_t sd_preview_mode         = PREVIEW_NONE;
+preview_t sd_preview_mode            = PREVIEW_NONE;
 int sd_preview_interval              = 1;
+bool sd_preview_denoised             = true;
+bool sd_preview_noisy                = false;
 
 std::u32string utf8_to_utf32(const std::string& utf8_str) {
     std::wstring_convert<std::codecvt_utf8<char32_t>, char32_t> converter;
@@ -335,10 +337,12 @@ void sd_set_progress_callback(sd_progress_cb_t cb, void* data) {
     sd_progress_cb      = cb;
     sd_progress_cb_data = data;
 }
-void sd_set_preview_callback(sd_preview_cb_t cb, preview_t mode = PREVIEW_PROJ, int interval = 1) {
+void sd_set_preview_callback(sd_preview_cb_t cb, preview_t mode = PREVIEW_PROJ, int interval = 1, bool denoised = true, bool noisy = false) {
     sd_preview_cb       = cb;
     sd_preview_mode     = mode;
     sd_preview_interval = interval;
+    sd_preview_denoised = denoised;
+    sd_preview_noisy    = noisy;
 }
 
 sd_preview_cb_t sd_get_preview_callback() {
@@ -351,6 +355,12 @@ preview_t sd_get_preview_mode() {
 int sd_get_preview_interval() {
     return sd_preview_interval;
 }
+bool sd_should_preview_denoised() {
+    return sd_preview_denoised;
+}
+bool sd_should_preview_noisy() {
+    return sd_preview_noisy;
+}
 
 sd_progress_cb_t sd_get_progress_callback() {
     return sd_progress_cb;
diff --git a/util.h b/util.h
index 3e34a2f7b..5bd69a624 100644
--- a/util.h
+++ b/util.h
@@ -60,6 +60,8 @@ void* sd_get_progress_callback_data();
 sd_preview_cb_t sd_get_preview_callback();
 preview_t sd_get_preview_mode();
 int sd_get_preview_interval();
+bool sd_should_preview_denoised();
+bool sd_should_preview_noisy();
 
 #define LOG_DEBUG(format, ...) log_printf(SD_LOG_DEBUG, __FILE__, __LINE__, format, ##__VA_ARGS__)
 #define LOG_INFO(format, ...) log_printf(SD_LOG_INFO, __FILE__, __LINE__, format, ##__VA_ARGS__)

From 27af5a452f8dd67314b30fc5ed8c327673d27967 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?St=C3=A9phane=20du=20Hamel?= <stephduh@live.fr>
Date: Sat, 25 Oct 2025 20:28:27 +0200
Subject: [PATCH 36/45] missing includes

---
 latent-preview.h | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/latent-preview.h b/latent-preview.h
index 67011837c..628e72a9c 100644
--- a/latent-preview.h
+++ b/latent-preview.h
@@ -1,3 +1,6 @@
+#include <cstddef>
+#include <cstdint>
+#include "ggml.h"
 const float wan_21_latent_rgb_proj[16][3] = {
     {-0.1299, -0.1692, 0.2932},
     {0.0671, 0.0406, 0.0442},

From 07c61f1afe1277450d2b31d8bc3b11fdba9ba8a1 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?St=C3=A9phane=20du=20Hamel?= <stephduh@live.fr>
Date: Sat, 25 Oct 2025 20:49:33 +0200
Subject: [PATCH 37/45] supports noisy preview in main

---
 examples/cli/main.cpp | 22 ++++++++++++++--------
 1 file changed, 14 insertions(+), 8 deletions(-)

diff --git a/examples/cli/main.cpp b/examples/cli/main.cpp
index c693aaee1..89de4c251 100644
--- a/examples/cli/main.cpp
+++ b/examples/cli/main.cpp
@@ -146,13 +146,13 @@ struct SDParams {
     int preview_interval     = 1;
     std::string preview_path = "preview.png";
     bool taesd_preview       = false;
+    bool preview_noisy       = false;
 
     SDParams() {
         sd_sample_params_init(&sample_params);
         sd_sample_params_init(&high_noise_sample_params);
         high_noise_sample_params.sample_steps = -1;
     }
-
 };
 
 void print_params(SDParams params) {
@@ -223,7 +223,7 @@ void print_params(SDParams params) {
     printf("    video_frames:                      %d\n", params.video_frames);
     printf("    vace_strength:                     %.2f\n", params.vace_strength);
     printf("    fps:                               %d\n", params.fps);
-    printf("    preview_mode:                      %s\n", previews_str[params.preview_method]);
+    printf("    preview_mode:                      %s (%s)\n", previews_str[params.preview_method], params.preview_noisy ? "noisy" : "denoised");
     printf("    preview_interval:                  %d\n", params.preview_interval);
     free(sample_params_str);
     free(high_noise_sample_params_str);
@@ -604,7 +604,7 @@ void parse_args(int argc, const char** argv, SDParams& params) {
          "--negative-prompt",
          "the negative prompt (default: \"\")",
          &params.negative_prompt},
-        {"", 
+        {"",
          "--preview-path",
          "path to write preview image to (default: ./preview.png)",
          &params.preview_path},
@@ -669,7 +669,7 @@ void parse_args(int argc, const char** argv, SDParams& params) {
         {"",
          "--preview-interval",
          "interval in denoising steps between consecutive updates of the image preview file (default is 1, meaning updating at every step)",
-          &params.preview_interval},
+         &params.preview_interval},
     };
 
     options.float_options = {
@@ -826,8 +826,12 @@ void parse_args(int argc, const char** argv, SDParams& params) {
          false, &params.auto_resize_ref_image},
         {"",
          "--taesd-preview-only",
-         std::string("prevents usage of taesd for decoding the final image. (for use with --preview ") + previews_str[PREVIEW_TAE] + ")", 
-         false, &params.taesd_preview},
+         std::string("prevents usage of taesd for decoding the final image. (for use with --preview ") + previews_str[PREVIEW_TAE] + ")",
+         true, &params.taesd_preview},
+         {"",
+        "--preview-noisy",
+        "enables previewing noisy inputs of the models rather than the denoised outputs",
+        true, &params.preview_noisy}
     };
 
     auto on_mode_arg = [&](int argc, const char** argv, int index) {
@@ -1507,6 +1511,7 @@ const char* preview_path;
 float preview_fps;
 
 void step_callback(int step, int frame_count, sd_image_t* image, bool is_noisy) {
+    (void)step;
     (void)is_noisy;
     // is_noisy is set to true if the preview corresponds to noisy latents, false if it's denoised latents
     // unused in this app, it will either be always noisy or always denoised here
@@ -1531,7 +1536,8 @@ int main(int argc, const char* argv[]) {
             std::transform(file_ext.begin(), file_ext.end(), file_ext.begin(), ::tolower);
         }
         if (file_ext == ".png") {
-            preview_path = (base_path + ".avi").c_str();
+            base_path    = base_path + ".avi";
+            preview_path = base_path.c_str();
         }
     }
     preview_fps = params.fps;
@@ -1544,7 +1550,7 @@ int main(int argc, const char* argv[]) {
     params.high_noise_sample_params.guidance.slg.layer_count = params.high_noise_skip_layers.size();
 
     sd_set_log_callback(sd_log_cb, (void*)&params);
-    sd_set_preview_callback((sd_preview_cb_t)step_callback, params.preview_method, params.preview_interval, true, false);
+    sd_set_preview_callback((sd_preview_cb_t)step_callback, params.preview_method, params.preview_interval, !params.preview_noisy, params.preview_noisy);
 
     if (params.verbose) {
         print_params(params);

From f80f61a66998d4364db14d0bd765674f0be0fd2e Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?St=C3=A9phane=20du=20Hamel?= <stephduh@live.fr>
Date: Sat, 25 Oct 2025 20:50:18 +0200
Subject: [PATCH 38/45] fix tae-preview-only (bad merge issue)

---
 stable-diffusion.cpp | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/stable-diffusion.cpp b/stable-diffusion.cpp
index 18da75c35..af5ef00b1 100644
--- a/stable-diffusion.cpp
+++ b/stable-diffusion.cpp
@@ -484,7 +484,7 @@ class StableDiffusionGGML {
                 vae_decode_only = false;
             }
 
-            if (high_noise_diffusion_model || sd_ctx_params->tae_preview_only) {
+            if (high_noise_diffusion_model) {
                 high_noise_diffusion_model->alloc_params_buffer();
                 high_noise_diffusion_model->get_param_tensors(tensors);
             }
@@ -508,7 +508,7 @@ class StableDiffusionGGML {
             } else if (version == VERSION_CHROMA_RADIANCE) {
                 first_stage_model = std::make_shared<FakeVAE>(vae_backend,
                                                               offload_params_to_cpu);
-            } else if (!use_tiny_autoencoder) {
+            } else if (!use_tiny_autoencoder || sd_ctx_params->tae_preview_only) {
                 first_stage_model = std::make_shared<AutoEncoderKL>(vae_backend,
                                                                     offload_params_to_cpu,
                                                                     model_loader.tensor_storages_types,

From 6c68e395ab87392e1e466097bc11efc66ed357aa Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?St=C3=A9phane=20du=20Hamel?= <stephduh@live.fr>
Date: Sat, 25 Oct 2025 20:53:49 +0200
Subject: [PATCH 39/45] format code

---
 examples/cli/main.cpp |  9 ++++-----
 latent-preview.h      |  6 +++---
 stable-diffusion.cpp  | 18 +++++++++---------
 3 files changed, 16 insertions(+), 17 deletions(-)

diff --git a/examples/cli/main.cpp b/examples/cli/main.cpp
index 89de4c251..619c42847 100644
--- a/examples/cli/main.cpp
+++ b/examples/cli/main.cpp
@@ -828,11 +828,10 @@ void parse_args(int argc, const char** argv, SDParams& params) {
          "--taesd-preview-only",
          std::string("prevents usage of taesd for decoding the final image. (for use with --preview ") + previews_str[PREVIEW_TAE] + ")",
          true, &params.taesd_preview},
-         {"",
-        "--preview-noisy",
-        "enables previewing noisy inputs of the models rather than the denoised outputs",
-        true, &params.preview_noisy}
-    };
+        {"",
+         "--preview-noisy",
+         "enables previewing noisy inputs of the models rather than the denoised outputs",
+         true, &params.preview_noisy}};
 
     auto on_mode_arg = [&](int argc, const char** argv, int index) {
         if (++index >= argc) {
diff --git a/latent-preview.h b/latent-preview.h
index 628e72a9c..aa0939eba 100644
--- a/latent-preview.h
+++ b/latent-preview.h
@@ -126,7 +126,7 @@ const float sd_latent_rgb_proj[4][3]{
     {0.3250f, 0.4974f, 0.2350f},
     {-0.2829f, 0.1762f, 0.2721f},
     {-0.2120f, -0.2616f, -0.7177f}};
-float sd_latent_rgb_bias[3] = {0,0,0};
+float sd_latent_rgb_bias[3] = {0, 0, 0};
 
 void preview_latent_video(uint8_t* buffer, struct ggml_tensor* latents, const float (*latent_rgb_proj)[3], const float latent_rgb_bias[3], int width, int height, int frames, int dim) {
     size_t buffer_head = 0;
@@ -135,7 +135,7 @@ void preview_latent_video(uint8_t* buffer, struct ggml_tensor* latents, const fl
             for (int i = 0; i < width; i++) {
                 size_t latent_id = (i * latents->nb[0] + j * latents->nb[1] + k * latents->nb[2]);
                 float r = 0, g = 0, b = 0;
-                if(latent_rgb_proj!=NULL){
+                if (latent_rgb_proj != NULL) {
                     for (int d = 0; d < dim; d++) {
                         float value = *(float*)((char*)latents->data + latent_id + d * latents->nb[ggml_n_dims(latents) - 1]);
                         r += value * latent_rgb_proj[d][0];
@@ -148,7 +148,7 @@ void preview_latent_video(uint8_t* buffer, struct ggml_tensor* latents, const fl
                     g = *(float*)((char*)latents->data + latent_id + 1 * latents->nb[ggml_n_dims(latents) - 1]);
                     b = *(float*)((char*)latents->data + latent_id + 2 * latents->nb[ggml_n_dims(latents) - 1]);
                 }
-                if(latent_rgb_bias!=NULL){
+                if (latent_rgb_bias != NULL) {
                     // bias
                     r += latent_rgb_bias[0];
                     g += latent_rgb_bias[1];
diff --git a/stable-diffusion.cpp b/stable-diffusion.cpp
index af5ef00b1..e89e5fb33 100644
--- a/stable-diffusion.cpp
+++ b/stable-diffusion.cpp
@@ -369,7 +369,7 @@ class StableDiffusionGGML {
                 diffusion_model  = std::make_shared<MMDiTModel>(backend,
                                                                 offload_params_to_cpu,
                                                                 sd_ctx_params->diffusion_flash_attn,
-                                                               model_loader.tensor_storages_types);
+                                                                model_loader.tensor_storages_types);
             } else if (sd_version_is_flux(version)) {
                 bool is_chroma = false;
                 for (auto pair : model_loader.tensor_storages_types) {
@@ -443,11 +443,11 @@ class StableDiffusionGGML {
                                                                             "",
                                                                             enable_vision);
                 diffusion_model  = std::make_shared<QwenImageModel>(backend,
-                                                                   offload_params_to_cpu,
-                                                                   model_loader.tensor_storages_types,
-                                                                   "model.diffusion_model",
-                                                                   version,
-                                                                   sd_ctx_params->diffusion_flash_attn);
+                                                                    offload_params_to_cpu,
+                                                                    model_loader.tensor_storages_types,
+                                                                    "model.diffusion_model",
+                                                                    version,
+                                                                    sd_ctx_params->diffusion_flash_attn);
             } else {  // SD1.x SD2.x SDXL
                 if (strstr(SAFE_STR(sd_ctx_params->photo_maker_path), "v2")) {
                     cond_stage_model = std::make_shared<FrozenCLIPEmbedderWithCustomWords>(clip_backend,
@@ -1157,7 +1157,7 @@ class StableDiffusionGGML {
 
         if (preview_mode == PREVIEW_PROJ) {
             const float (*latent_rgb_proj)[channel] = NULL;
-            float *latent_rgb_bias = NULL;
+            float* latent_rgb_bias                  = NULL;
 
             if (dim == 48) {
                 if (sd_version_is_wan(version)) {
@@ -1214,7 +1214,7 @@ class StableDiffusionGGML {
 
             uint8_t* data = (uint8_t*)malloc(frames * width * height * channel * sizeof(uint8_t));
 
-            preview_latent_video(data, latents, latent_rgb_proj,latent_rgb_bias, width, height, frames, dim);
+            preview_latent_video(data, latents, latent_rgb_proj, latent_rgb_bias, width, height, frames, dim);
             sd_image_t* images = (sd_image_t*)malloc(frames * sizeof(sd_image_t));
             for (int i = 0; i < frames; i++) {
                 images[i] = {width, height, channel, data + i * width * height * channel};
@@ -1274,7 +1274,7 @@ class StableDiffusionGGML {
             }
 
             step_callback(step, frames, images, is_noisy);
-            
+
             ggml_tensor_scale(result, 0);
             for (int i = 0; i < frames; i++) {
                 free(images[i].data);

From fc2a71e56f0c53d7b6687cd7251ed4c0049bb6a0 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?St=C3=A9phane=20du=20Hamel?= <stephduh@live.fr>
Date: Sat, 25 Oct 2025 21:36:39 +0200
Subject: [PATCH 40/45] update help in readme

---
 examples/cli/README.md | 1 +
 1 file changed, 1 insertion(+)

diff --git a/examples/cli/README.md b/examples/cli/README.md
index abbb0b6f8..00e0942f1 100644
--- a/examples/cli/README.md
+++ b/examples/cli/README.md
@@ -90,6 +90,7 @@ Options:
   --increase-ref-index                     automatically increase the indices of references images based on the order they are listed (starting with 1).
   --disable-auto-resize-ref-image          disable auto resize of ref images
   --taesd-preview-only                     prevents usage of taesd for decoding the final image. (for use with --preview tae)
+  --preview-noisy                          enables previewing noisy inputs of the models rather than the denoised outputs
   -M, --mode                               run mode, one of [img_gen, vid_gen, upscale, convert], default: img_gen
   --type                                   weight type (examples: f32, f16, q4_0, q4_1, q5_0, q5_1, q8_0, q2_K, q3_K, q4_K). If not specified, the default is the
                                            type of the weight file

From 8a3346f8cd6c833d582ed92ec5e9a735fc2e10d6 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?St=C3=A9phane=20du=20Hamel?= <stephduh@live.fr>
Date: Tue, 28 Oct 2025 14:55:06 +0100
Subject: [PATCH 41/45] use bespoke latent to rgb projection to prevent
 licensing issues

---
 latent-preview.h | 205 ++++++++++++++++++++++++-----------------------
 1 file changed, 105 insertions(+), 100 deletions(-)

diff --git a/latent-preview.h b/latent-preview.h
index aa0939eba..4f83cfc6d 100644
--- a/latent-preview.h
+++ b/latent-preview.h
@@ -1,97 +1,102 @@
 #include <cstddef>
 #include <cstdint>
 #include "ggml.h"
+
 const float wan_21_latent_rgb_proj[16][3] = {
-    {-0.1299, -0.1692, 0.2932},
-    {0.0671, 0.0406, 0.0442},
-    {0.3568, 0.2548, 0.1747},
-    {0.0372, 0.2344, 0.1420},
-    {0.0313, 0.0189, -0.0328},
-    {0.0296, -0.0956, -0.0665},
-    {-0.3477, -0.4059, -0.2925},
-    {0.0166, 0.1902, 0.1975},
-    {-0.0412, 0.0267, -0.1364},
-    {-0.1293, 0.0740, 0.1636},
-    {0.0680, 0.3019, 0.1128},
-    {0.0032, 0.0581, 0.0639},
-    {-0.1251, 0.0927, 0.1699},
-    {0.0060, -0.0633, 0.0005},
-    {0.3477, 0.2275, 0.2950},
-    {0.1984, 0.0913, 0.1861}};
-float wan_21_latent_rgb_bias[3] = {-0.1223, -0.1889, -0.1976};
+    {0.015123f, -0.148418f, 0.479828f},
+    {0.003652f, -0.010680f, -0.037142f},
+    {0.212264f, 0.063033f, 0.016779f},
+    {0.232999f, 0.406476f, 0.220125f},
+    {-0.051864f, -0.082384f, -0.069396f},
+    {0.085005f, -0.161492f, 0.010689f},
+    {-0.245369f, -0.506846f, -0.117010f},
+    {-0.151145f, 0.017721f, 0.007207f},
+    {-0.293239f, -0.207936f, -0.421135f},
+    {-0.187721f, 0.050783f, 0.177649f},
+    {-0.013067f, 0.265964f, 0.166578f},
+    {0.028327f, 0.109329f, 0.108642f},
+    {-0.205343f, 0.043991f, 0.148914f},
+    {0.014307f, -0.048647f, -0.007219f},
+    {0.217150f, 0.053074f, 0.319923f},
+    {0.155357f, 0.083156f, 0.064780f}
+};
+float wan_21_latent_rgb_bias[3] =  {-0.270270f, -0.234976f, -0.456853f};
 
 const float wan_22_latent_rgb_proj[48][3] = {
-    {0.0119, 0.0103, 0.0046},
-    {-0.1062, -0.0504, 0.0165},
-    {0.0140, 0.0409, 0.0491},
-    {-0.0813, -0.0677, 0.0607},
-    {0.0656, 0.0851, 0.0808},
-    {0.0264, 0.0463, 0.0912},
-    {0.0295, 0.0326, 0.0590},
-    {-0.0244, -0.0270, 0.0025},
-    {0.0443, -0.0102, 0.0288},
-    {-0.0465, -0.0090, -0.0205},
-    {0.0359, 0.0236, 0.0082},
-    {-0.0776, 0.0854, 0.1048},
-    {0.0564, 0.0264, 0.0561},
-    {0.0006, 0.0594, 0.0418},
-    {-0.0319, -0.0542, -0.0637},
-    {-0.0268, 0.0024, 0.0260},
-    {0.0539, 0.0265, 0.0358},
-    {-0.0359, -0.0312, -0.0287},
-    {-0.0285, -0.1032, -0.1237},
-    {0.1041, 0.0537, 0.0622},
-    {-0.0086, -0.0374, -0.0051},
-    {0.0390, 0.0670, 0.2863},
-    {0.0069, 0.0144, 0.0082},
-    {0.0006, -0.0167, 0.0079},
-    {0.0313, -0.0574, -0.0232},
-    {-0.1454, -0.0902, -0.0481},
-    {0.0714, 0.0827, 0.0447},
-    {-0.0304, -0.0574, -0.0196},
-    {0.0401, 0.0384, 0.0204},
-    {-0.0758, -0.0297, -0.0014},
-    {0.0568, 0.1307, 0.1372},
-    {-0.0055, -0.0310, -0.0380},
-    {0.0239, -0.0305, 0.0325},
-    {-0.0663, -0.0673, -0.0140},
-    {-0.0416, -0.0047, -0.0023},
-    {0.0166, 0.0112, -0.0093},
-    {-0.0211, 0.0011, 0.0331},
-    {0.1833, 0.1466, 0.2250},
-    {-0.0368, 0.0370, 0.0295},
-    {-0.3441, -0.3543, -0.2008},
-    {-0.0479, -0.0489, -0.0420},
-    {-0.0660, -0.0153, 0.0800},
-    {-0.0101, 0.0068, 0.0156},
-    {-0.0690, -0.0452, -0.0927},
-    {-0.0145, 0.0041, 0.0015},
-    {0.0421, 0.0451, 0.0373},
-    {0.0504, -0.0483, -0.0356},
-    {-0.0837, 0.0168, 0.0055}};
-float wan_22_latent_rgb_bias[3] = {0.0317, -0.0878, -0.1388};
+    {0.017126f, -0.027230f, -0.019257f},
+    {-0.113739f, -0.028715f, -0.022885f},
+    {-0.000106f, 0.021494f, 0.004629f},
+    {-0.013273f, -0.107137f, -0.033638f},
+    {-0.000381f, 0.000279f, 0.025877f},
+    {-0.014216f, -0.003975f, 0.040528f},
+    {0.001638f, -0.000748f, 0.011022f},
+    {0.029238f, -0.006697f, 0.035933f},
+    {0.021641f, -0.015874f, 0.040531f},
+    {-0.101984f, -0.070160f, -0.028855f},
+    {0.033207f, -0.021068f, 0.002663f},
+    {-0.104711f, 0.121673f, 0.102981f},
+    {0.082647f, -0.004991f, 0.057237f},
+    {-0.027375f, 0.031581f, 0.006868f},
+    {-0.045434f, 0.029444f, 0.019287f},
+    {-0.046572f, -0.012537f, 0.006675f},
+    {0.074709f, 0.033690f, 0.025289f},
+    {-0.008251f, -0.002745f, -0.006999f},
+    {0.012685f, -0.061856f, -0.048658f},
+    {0.042304f, -0.007039f, 0.000295f},
+    {-0.007644f, -0.060843f, -0.033142f},
+    {0.159909f, 0.045628f, 0.367541f},
+    {0.095171f, 0.086438f, 0.010271f},
+    {0.006812f, 0.019643f, 0.029637f},
+    {0.003467f, -0.010705f, 0.014252f},
+    {-0.099681f, -0.066272f, -0.006243f},
+    {0.047357f, 0.037040f, 0.000185f},
+    {-0.041797f, -0.089225f, -0.032257f},
+    {0.008928f, 0.017028f, 0.018684f},
+    {-0.042255f, 0.016045f, 0.006849f},
+    {0.011268f, 0.036462f, 0.037387f},
+    {0.011553f, -0.016375f, -0.048589f},
+    {0.046266f, -0.027189f, 0.056979f},
+    {0.009640f, -0.017576f, 0.030324f},
+    {-0.045794f, -0.036083f, -0.010616f},
+    {0.022418f, 0.039783f, -0.032939f},
+    {-0.052714f, -0.015525f, 0.007438f},
+    {0.193004f, 0.223541f, 0.264175f},
+    {-0.059406f, -0.008188f, 0.022867f},
+    {-0.156742f, -0.263791f, -0.007385f},
+    {-0.015717f, 0.016570f, 0.033969f},
+    {0.037969f, 0.109835f, 0.200449f},
+    {-0.000782f, -0.009566f, -0.008058f},
+    {0.010709f, 0.052960f, -0.044195f},
+    {0.017271f, 0.045839f, 0.034569f},
+    {0.009424f, 0.013088f, -0.001714f},
+    {-0.024805f, -0.059378f, -0.033756f},
+    {-0.078293f, 0.029070f, 0.026129f}
+};
+float wan_22_latent_rgb_bias[3] =  {0.013160f, -0.096492f, -0.071323f};
 
-// https://github.com/comfyanonymous/ComfyUI/blob/master/comfy/latent_formats.py#L152-L169
 const float flux_latent_rgb_proj[16][3] = {
-    {-0.0346f, 0.0244f, 0.0681f},
-    {0.0034f, 0.0210f, 0.0687f},
-    {0.0275f, -0.0668f, -0.0433f},
-    {-0.0174f, 0.0160f, 0.0617f},
-    {0.0859f, 0.0721f, 0.0329f},
-    {0.0004f, 0.0383f, 0.0115f},
-    {0.0405f, 0.0861f, 0.0915f},
-    {-0.0236f, -0.0185f, -0.0259f},
-    {-0.0245f, 0.0250f, 0.1180f},
-    {0.1008f, 0.0755f, -0.0421f},
-    {-0.0515f, 0.0201f, 0.0011f},
-    {0.0428f, -0.0012f, -0.0036f},
-    {0.0817f, 0.0765f, 0.0749f},
-    {-0.1264f, -0.0522f, -0.1103f},
-    {-0.0280f, -0.0881f, -0.0499f},
-    {-0.1262f, -0.0982f, -0.0778f}};
-float flux_latent_rgb_bias[3] = {-0.0329, -0.0718, -0.0851};
+    {-0.041168f, 0.019917f, 0.097253f},
+    {0.028096f, 0.026730f, 0.129576f},
+    {0.065618f, -0.067950f, -0.014651f},
+    {-0.012998f, -0.014762f, 0.081251f},
+    {0.078567f, 0.059296f, -0.024687f},
+    {-0.015987f, -0.003697f, 0.005012f},
+    {0.033605f, 0.138999f, 0.068517f},
+    {-0.024450f, -0.063567f, -0.030101f},
+    {-0.040194f, -0.016710f, 0.127185f},
+    {0.112681f, 0.088764f, -0.041940f},
+    {-0.023498f, 0.093664f, 0.025543f},
+    {0.082899f, 0.048320f, 0.007491f},
+    {0.075712f, 0.074139f, 0.081965f},
+    {-0.143501f, 0.018263f, -0.136138f},
+    {-0.025767f, -0.082035f, -0.040023f},
+    {-0.111849f, -0.055589f, -0.032361f}
+};
+float flux_latent_rgb_bias[3] = {0.024600f, -0.006937f, -0.008089f};
 
-// https://github.com/Stability-AI/sd3.5/blob/main/sd3_impls.py#L228-L246
+// This one was taken straight from 
+// https://github.com/Stability-AI/sd3.5/blob/8565799a3b41eb0c7ba976d18375f0f753f56402/sd3_impls.py#L288-L303
+// (MiT Licence)
 const float sd3_latent_rgb_proj[16][3] = {
     {-0.0645f, 0.0177f, 0.1052f},
     {0.0028f, 0.0312f, 0.0650f},
@@ -110,23 +115,23 @@ const float sd3_latent_rgb_proj[16][3] = {
     {-0.0749f, -0.0634f, -0.0456f},
     {-0.1418f, -0.1457f, -0.1259f},
 };
-float sd3_latent_rgb_bias[3] = {0, 0, 0};
+float sd3_latent_rgb_bias[3] = NULL;
 
-// https://github.com/comfyanonymous/ComfyUI/blob/master/comfy/latent_formats.py#L32-L38
-const float sdxl_latent_rgb_proj[4][3] = {
-    {0.3651f, 0.4232f, 0.4341f},
-    {-0.2533f, -0.0042f, 0.1068f},
-    {0.1076f, 0.1111f, -0.0362f},
-    {-0.3165f, -0.2492f, -0.2188f}};
-float sdxl_latent_rgb_bias[3] = {0.1084, -0.0175, -0.0011};
+const float sdxl_latent_rgb_proj[4][3] =   {
+    {0.258303f, 0.277640f, 0.329699f},
+    {-0.299701f, 0.105446f, 0.014194f},
+    {0.050522f, 0.186163f, -0.143257f},
+    {-0.211938f, -0.149892f, -0.080036f}
+};
+float sdxl_latent_rgb_bias[3] = {0.144381f, -0.033313f, 0.007061f};
 
-// https://github.com/comfyanonymous/ComfyUI/blob/master/comfy/latent_formats.py#L32-L38
-const float sd_latent_rgb_proj[4][3]{
-    {0.3512f, 0.2297f, 0.3227f},
-    {0.3250f, 0.4974f, 0.2350f},
-    {-0.2829f, 0.1762f, 0.2721f},
-    {-0.2120f, -0.2616f, -0.7177f}};
-float sd_latent_rgb_bias[3] = {0, 0, 0};
+const float sd_latent_rgb_proj[4][3] = {
+    {0.337366f, 0.216344f, 0.257386f},
+    {0.165636f, 0.386828f, 0.046994f},
+    {-0.267803f, 0.237036f, 0.223517f},
+    {-0.178022f, -0.200862f, -0.678514f}
+};
+float sd_latent_rgb_bias[3] ={-0.017478f, -0.055834f, -0.105825f};
 
 void preview_latent_video(uint8_t* buffer, struct ggml_tensor* latents, const float (*latent_rgb_proj)[3], const float latent_rgb_bias[3], int width, int height, int frames, int dim) {
     size_t buffer_head = 0;

From b5e73f9c1acf80f9b2cd5c3e8b01c1614c1c60b3 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?St=C3=A9phane=20du=20Hamel?= <stephduh@live.fr>
Date: Tue, 28 Oct 2025 15:04:08 +0100
Subject: [PATCH 42/45] fix sd3 null bias breaking build

---
 latent-preview.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/latent-preview.h b/latent-preview.h
index 4f83cfc6d..354f8c3a4 100644
--- a/latent-preview.h
+++ b/latent-preview.h
@@ -13,7 +13,7 @@ const float wan_21_latent_rgb_proj[16][3] = {
     {-0.151145f, 0.017721f, 0.007207f},
     {-0.293239f, -0.207936f, -0.421135f},
     {-0.187721f, 0.050783f, 0.177649f},
-    {-0.013067f, 0.265964f, 0.166578f},
+    {-0.013067f, 0.265964f, 0.166578f       },
     {0.028327f, 0.109329f, 0.108642f},
     {-0.205343f, 0.043991f, 0.148914f},
     {0.014307f, -0.048647f, -0.007219f},
@@ -115,7 +115,7 @@ const float sd3_latent_rgb_proj[16][3] = {
     {-0.0749f, -0.0634f, -0.0456f},
     {-0.1418f, -0.1457f, -0.1259f},
 };
-float sd3_latent_rgb_bias[3] = NULL;
+float sd3_latent_rgb_bias[3] = {0, 0, 0};
 
 const float sdxl_latent_rgb_proj[4][3] =   {
     {0.258303f, 0.277640f, 0.329699f},

From c1226d6f5332a5366bbf4c5ca6af59b311f379f5 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?St=C3=A9phane=20du=20Hamel?= <stephduh@live.fr>
Date: Tue, 28 Oct 2025 16:40:06 +0100
Subject: [PATCH 43/45] use new ggml_ext function names

---
 stable-diffusion.cpp | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/stable-diffusion.cpp b/stable-diffusion.cpp
index 3bae2af07..c325a284c 100644
--- a/stable-diffusion.cpp
+++ b/stable-diffusion.cpp
@@ -1250,7 +1250,7 @@ class StableDiffusionGGML {
                 return;
             }
 
-            ggml_tensor_clamp(result, 0.0f, 1.0f);
+            ggml_ext_tensor_clamp_inplace(result, 0.0f, 1.0f);
             uint32_t frames = 1;
             if (ggml_n_dims(latents) == 4) {
                 frames = result->ne[2];
@@ -1262,12 +1262,12 @@ class StableDiffusionGGML {
                 images[i].width   = result->ne[0];
                 images[i].height  = result->ne[1];
                 images[i].channel = 3;
-                images[i].data    = sd_tensor_to_image(result, i, ggml_n_dims(latents) == 4);
+                images[i].data    = ggml_tensor_to_sd_image(result, i, ggml_n_dims(latents) == 4);
             }
 
             step_callback(step, frames, images, is_noisy);
 
-            ggml_tensor_scale(result, 0);
+            ggml_ext_tensor_scale_inplace(result, 0);
             for (int i = 0; i < frames; i++) {
                 free(images[i].data);
             }

From 3db7fb14370a979edc3cd51437aaedfbaf42766d Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?St=C3=A9phane=20du=20Hamel?= <stephduh@live.fr>
Date: Wed, 29 Oct 2025 11:20:15 +0100
Subject: [PATCH 44/45] Fix radiance proj support (RGB latents: dim 3, not 4)

---
 stable-diffusion.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/stable-diffusion.cpp b/stable-diffusion.cpp
index c325a284c..9b6e2fc56 100644
--- a/stable-diffusion.cpp
+++ b/stable-diffusion.cpp
@@ -1191,7 +1191,7 @@ class StableDiffusionGGML {
                     LOG_WARN("No latent to RGB projection known for this model");
                     return;
                 }
-            } else if (dim == 4) {
+            } else if (dim == 3) {
                 // Do nothing, assuming already RGB latents
             } else {
                 LOG_WARN("No latent to RGB projection known for this model");

From 044f0ed89055d6513b96620701e737cebe5803da Mon Sep 17 00:00:00 2001
From: leejet <leejet714@gmail.com>
Date: Mon, 10 Nov 2025 00:00:29 +0800
Subject: [PATCH 45/45] format code

---
 latent-preview.h     | 31 +++++++++++++------------------
 stable-diffusion.cpp | 40 ++++++++++++++++++++--------------------
 util.cpp             |  2 +-
 3 files changed, 34 insertions(+), 39 deletions(-)

diff --git a/latent-preview.h b/latent-preview.h
index 354f8c3a4..97409a7d8 100644
--- a/latent-preview.h
+++ b/latent-preview.h
@@ -13,14 +13,13 @@ const float wan_21_latent_rgb_proj[16][3] = {
     {-0.151145f, 0.017721f, 0.007207f},
     {-0.293239f, -0.207936f, -0.421135f},
     {-0.187721f, 0.050783f, 0.177649f},
-    {-0.013067f, 0.265964f, 0.166578f       },
+    {-0.013067f, 0.265964f, 0.166578f},
     {0.028327f, 0.109329f, 0.108642f},
     {-0.205343f, 0.043991f, 0.148914f},
     {0.014307f, -0.048647f, -0.007219f},
     {0.217150f, 0.053074f, 0.319923f},
-    {0.155357f, 0.083156f, 0.064780f}
-};
-float wan_21_latent_rgb_bias[3] =  {-0.270270f, -0.234976f, -0.456853f};
+    {0.155357f, 0.083156f, 0.064780f}};
+float wan_21_latent_rgb_bias[3] = {-0.270270f, -0.234976f, -0.456853f};
 
 const float wan_22_latent_rgb_proj[48][3] = {
     {0.017126f, -0.027230f, -0.019257f},
@@ -70,9 +69,8 @@ const float wan_22_latent_rgb_proj[48][3] = {
     {0.017271f, 0.045839f, 0.034569f},
     {0.009424f, 0.013088f, -0.001714f},
     {-0.024805f, -0.059378f, -0.033756f},
-    {-0.078293f, 0.029070f, 0.026129f}
-};
-float wan_22_latent_rgb_bias[3] =  {0.013160f, -0.096492f, -0.071323f};
+    {-0.078293f, 0.029070f, 0.026129f}};
+float wan_22_latent_rgb_bias[3] = {0.013160f, -0.096492f, -0.071323f};
 
 const float flux_latent_rgb_proj[16][3] = {
     {-0.041168f, 0.019917f, 0.097253f},
@@ -90,11 +88,10 @@ const float flux_latent_rgb_proj[16][3] = {
     {0.075712f, 0.074139f, 0.081965f},
     {-0.143501f, 0.018263f, -0.136138f},
     {-0.025767f, -0.082035f, -0.040023f},
-    {-0.111849f, -0.055589f, -0.032361f}
-};
+    {-0.111849f, -0.055589f, -0.032361f}};
 float flux_latent_rgb_bias[3] = {0.024600f, -0.006937f, -0.008089f};
 
-// This one was taken straight from 
+// This one was taken straight from
 // https://github.com/Stability-AI/sd3.5/blob/8565799a3b41eb0c7ba976d18375f0f753f56402/sd3_impls.py#L288-L303
 // (MiT Licence)
 const float sd3_latent_rgb_proj[16][3] = {
@@ -117,21 +114,19 @@ const float sd3_latent_rgb_proj[16][3] = {
 };
 float sd3_latent_rgb_bias[3] = {0, 0, 0};
 
-const float sdxl_latent_rgb_proj[4][3] =   {
+const float sdxl_latent_rgb_proj[4][3] = {
     {0.258303f, 0.277640f, 0.329699f},
     {-0.299701f, 0.105446f, 0.014194f},
     {0.050522f, 0.186163f, -0.143257f},
-    {-0.211938f, -0.149892f, -0.080036f}
-};
+    {-0.211938f, -0.149892f, -0.080036f}};
 float sdxl_latent_rgb_bias[3] = {0.144381f, -0.033313f, 0.007061f};
 
 const float sd_latent_rgb_proj[4][3] = {
     {0.337366f, 0.216344f, 0.257386f},
     {0.165636f, 0.386828f, 0.046994f},
     {-0.267803f, 0.237036f, 0.223517f},
-    {-0.178022f, -0.200862f, -0.678514f}
-};
-float sd_latent_rgb_bias[3] ={-0.017478f, -0.055834f, -0.105825f};
+    {-0.178022f, -0.200862f, -0.678514f}};
+float sd_latent_rgb_bias[3] = {-0.017478f, -0.055834f, -0.105825f};
 
 void preview_latent_video(uint8_t* buffer, struct ggml_tensor* latents, const float (*latent_rgb_proj)[3], const float latent_rgb_bias[3], int width, int height, int frames, int dim) {
     size_t buffer_head = 0;
@@ -140,7 +135,7 @@ void preview_latent_video(uint8_t* buffer, struct ggml_tensor* latents, const fl
             for (int i = 0; i < width; i++) {
                 size_t latent_id = (i * latents->nb[0] + j * latents->nb[1] + k * latents->nb[2]);
                 float r = 0, g = 0, b = 0;
-                if (latent_rgb_proj != NULL) {
+                if (latent_rgb_proj != nullptr) {
                     for (int d = 0; d < dim; d++) {
                         float value = *(float*)((char*)latents->data + latent_id + d * latents->nb[ggml_n_dims(latents) - 1]);
                         r += value * latent_rgb_proj[d][0];
@@ -153,7 +148,7 @@ void preview_latent_video(uint8_t* buffer, struct ggml_tensor* latents, const fl
                     g = *(float*)((char*)latents->data + latent_id + 1 * latents->nb[ggml_n_dims(latents) - 1]);
                     b = *(float*)((char*)latents->data + latent_id + 2 * latents->nb[ggml_n_dims(latents) - 1]);
                 }
-                if (latent_rgb_bias != NULL) {
+                if (latent_rgb_bias != nullptr) {
                     // bias
                     r += latent_rgb_bias[0];
                     g += latent_rgb_bias[1];
diff --git a/stable-diffusion.cpp b/stable-diffusion.cpp
index 91a6eff05..b675b85b0 100644
--- a/stable-diffusion.cpp
+++ b/stable-diffusion.cpp
@@ -1124,7 +1124,7 @@ class StableDiffusionGGML {
     void silent_tiling(ggml_tensor* input, ggml_tensor* output, const int scale, const int tile_size, const float tile_overlap_factor, on_tile_process on_processing) {
         sd_progress_cb_t cb = sd_get_progress_callback();
         void* cbd           = sd_get_progress_callback_data();
-        sd_set_progress_callback((sd_progress_cb_t)suppress_pp, NULL);
+        sd_set_progress_callback((sd_progress_cb_t)suppress_pp, nullptr);
         sd_tiling(input, output, scale, tile_size, tile_overlap_factor, on_processing);
         sd_set_progress_callback(cb, cbd);
     }
@@ -1143,8 +1143,8 @@ class StableDiffusionGGML {
         uint32_t dim           = latents->ne[ggml_n_dims(latents) - 1];
 
         if (preview_mode == PREVIEW_PROJ) {
-            const float (*latent_rgb_proj)[channel] = NULL;
-            float* latent_rgb_bias                  = NULL;
+            const float(*latent_rgb_proj)[channel] = nullptr;
+            float* latent_rgb_bias                 = nullptr;
 
             if (dim == 48) {
                 if (sd_version_is_wan(version)) {
@@ -1215,7 +1215,7 @@ class StableDiffusionGGML {
                 if (vae_tiling_params.enabled) {
                     // split latent in 32x32 tiles and compute in several steps
                     auto on_tiling = [&](ggml_tensor* in, ggml_tensor* out, bool init) {
-                        first_stage_model->compute(n_threads, in, true, &out, NULL);
+                        first_stage_model->compute(n_threads, in, true, &out, nullptr);
                     };
                     silent_tiling(latents, result, get_vae_scale_factor(), 32, 0.5f, on_tiling);
 
@@ -1234,7 +1234,7 @@ class StableDiffusionGGML {
                 if (vae_tiling_params.enabled) {
                     // split latent in 64x64 tiles and compute in several steps
                     auto on_tiling = [&](ggml_tensor* in, ggml_tensor* out, bool init) {
-                        tae_first_stage->compute(n_threads, in, true, &out, NULL);
+                        tae_first_stage->compute(n_threads, in, true, &out, nullptr);
                     };
                     silent_tiling(latents, result, get_vae_scale_factor(), 64, 0.5f, on_tiling);
                 } else {
@@ -1346,7 +1346,7 @@ class StableDiffusionGGML {
 
         int64_t t0 = ggml_time_us();
 
-        struct ggml_tensor* preview_tensor = NULL;
+        struct ggml_tensor* preview_tensor = nullptr;
         auto sd_preview_mode               = sd_get_preview_mode();
         if (sd_preview_mode != PREVIEW_NONE && sd_preview_mode != PREVIEW_PROJ) {
             int64_t W = x->ne[0] * get_vae_scale_factor();
@@ -1408,7 +1408,7 @@ class StableDiffusionGGML {
             if (denoise_mask != nullptr && version == VERSION_WAN2_2_TI2V) {
                 apply_mask(noised_input, init_latent, denoise_mask);
             }
-            if (sd_preview_cb != NULL && sd_should_preview_noisy()) {
+            if (sd_preview_cb != nullptr && sd_should_preview_noisy()) {
                 if (step % sd_get_preview_interval() == 0) {
                     preview_image(work_ctx, step, noised_input, version, sd_preview_mode, preview_tensor, sd_preview_cb, true);
                 }
@@ -1538,7 +1538,7 @@ class StableDiffusionGGML {
                 apply_mask(denoised, init_latent, denoise_mask);
             }
 
-            if (sd_preview_cb != NULL && sd_should_preview_denoised()) {
+            if (sd_preview_cb != nullptr && sd_should_preview_denoised()) {
                 if (step % sd_get_preview_interval() == 0) {
                     preview_image(work_ctx, step, denoised, version, sd_preview_mode, preview_tensor, sd_preview_cb, false);
                 }
@@ -1629,12 +1629,12 @@ class StableDiffusionGGML {
                                     -0.0313f, -0.1649f, 0.0117f, 0.0723f, -0.2839f, -0.2083f, -0.0520f, 0.3748f,
                                     0.0152f, 0.1957f, 0.1433f, -0.2944f, 0.3573f, -0.0548f, -0.1681f, -0.0667f};
                 latents_std_vec  = {
-                    0.4765f, 1.0364f, 0.4514f, 1.1677f, 0.5313f, 0.4990f, 0.4818f, 0.5013f,
-                    0.8158f, 1.0344f, 0.5894f, 1.0901f, 0.6885f, 0.6165f, 0.8454f, 0.4978f,
-                    0.5759f, 0.3523f, 0.7135f, 0.6804f, 0.5833f, 1.4146f, 0.8986f, 0.5659f,
-                    0.7069f, 0.5338f, 0.4889f, 0.4917f, 0.4069f, 0.4999f, 0.6866f, 0.4093f,
-                    0.5709f, 0.6065f, 0.6415f, 0.4944f, 0.5726f, 1.2042f, 0.5458f, 1.6887f,
-                    0.3971f, 1.0600f, 0.3943f, 0.5537f, 0.5444f, 0.4089f, 0.7468f, 0.7744f};
+                     0.4765f, 1.0364f, 0.4514f, 1.1677f, 0.5313f, 0.4990f, 0.4818f, 0.5013f,
+                     0.8158f, 1.0344f, 0.5894f, 1.0901f, 0.6885f, 0.6165f, 0.8454f, 0.4978f,
+                     0.5759f, 0.3523f, 0.7135f, 0.6804f, 0.5833f, 1.4146f, 0.8986f, 0.5659f,
+                     0.7069f, 0.5338f, 0.4889f, 0.4917f, 0.4069f, 0.4999f, 0.6866f, 0.4093f,
+                     0.5709f, 0.6065f, 0.6415f, 0.4944f, 0.5726f, 1.2042f, 0.5458f, 1.6887f,
+                     0.3971f, 1.0600f, 0.3943f, 0.5537f, 0.5444f, 0.4089f, 0.7468f, 0.7744f};
             }
             for (int i = 0; i < latent->ne[3]; i++) {
                 float mean = latents_mean_vec[i];
@@ -1675,12 +1675,12 @@ class StableDiffusionGGML {
                                     -0.0313f, -0.1649f, 0.0117f, 0.0723f, -0.2839f, -0.2083f, -0.0520f, 0.3748f,
                                     0.0152f, 0.1957f, 0.1433f, -0.2944f, 0.3573f, -0.0548f, -0.1681f, -0.0667f};
                 latents_std_vec  = {
-                    0.4765f, 1.0364f, 0.4514f, 1.1677f, 0.5313f, 0.4990f, 0.4818f, 0.5013f,
-                    0.8158f, 1.0344f, 0.5894f, 1.0901f, 0.6885f, 0.6165f, 0.8454f, 0.4978f,
-                    0.5759f, 0.3523f, 0.7135f, 0.6804f, 0.5833f, 1.4146f, 0.8986f, 0.5659f,
-                    0.7069f, 0.5338f, 0.4889f, 0.4917f, 0.4069f, 0.4999f, 0.6866f, 0.4093f,
-                    0.5709f, 0.6065f, 0.6415f, 0.4944f, 0.5726f, 1.2042f, 0.5458f, 1.6887f,
-                    0.3971f, 1.0600f, 0.3943f, 0.5537f, 0.5444f, 0.4089f, 0.7468f, 0.7744f};
+                     0.4765f, 1.0364f, 0.4514f, 1.1677f, 0.5313f, 0.4990f, 0.4818f, 0.5013f,
+                     0.8158f, 1.0344f, 0.5894f, 1.0901f, 0.6885f, 0.6165f, 0.8454f, 0.4978f,
+                     0.5759f, 0.3523f, 0.7135f, 0.6804f, 0.5833f, 1.4146f, 0.8986f, 0.5659f,
+                     0.7069f, 0.5338f, 0.4889f, 0.4917f, 0.4069f, 0.4999f, 0.6866f, 0.4093f,
+                     0.5709f, 0.6065f, 0.6415f, 0.4944f, 0.5726f, 1.2042f, 0.5458f, 1.6887f,
+                     0.3971f, 1.0600f, 0.3943f, 0.5537f, 0.5444f, 0.4089f, 0.7468f, 0.7744f};
             }
             for (int i = 0; i < latent->ne[3]; i++) {
                 float mean = latents_mean_vec[i];
diff --git a/util.cpp b/util.cpp
index b30f5cef9..1aa9beff8 100644
--- a/util.cpp
+++ b/util.cpp
@@ -185,7 +185,7 @@ int32_t get_num_physical_cores() {
 static sd_progress_cb_t sd_progress_cb = nullptr;
 void* sd_progress_cb_data              = nullptr;
 
-static sd_preview_cb_t sd_preview_cb = NULL;
+static sd_preview_cb_t sd_preview_cb = nullptr;
 preview_t sd_preview_mode            = PREVIEW_NONE;
 int sd_preview_interval              = 1;
 bool sd_preview_denoised             = true;