From e8ac336d91b855675e06b15a600a945e0e48da48 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?St=C3=A9phane=20du=20Hamel?= Date: Thu, 10 Jul 2025 19:13:43 +0200 Subject: [PATCH 01/45] fast latent image preview --- examples/cli/main.cpp | 124 +++++++++++++++++++++++++++++++++++++++++- stable-diffusion.cpp | 19 +++++-- stable-diffusion.h | 4 +- 3 files changed, 139 insertions(+), 8 deletions(-) diff --git a/examples/cli/main.cpp b/examples/cli/main.cpp index ec04dfde3..ef7877aa1 100644 --- a/examples/cli/main.cpp +++ b/examples/cli/main.cpp @@ -9,6 +9,9 @@ #include #include +#include "model.h" + + // #include "preprocessing.hpp" #include "stable-diffusion.h" @@ -752,6 +755,125 @@ void sd_log_cb(enum sd_log_level_t level, const char* log, void* data) { fflush(out_stream); } +// https://github.com/comfyanonymous/ComfyUI/blob/master/comfy/latent_formats.py#L152-L169 +const float flux_latent_rgb_proj[16][3] = { + {-0.0346, 0.0244, 0.0681}, + {0.0034, 0.0210, 0.0687}, + {0.0275, -0.0668, -0.0433}, + {-0.0174, 0.0160, 0.0617}, + {0.0859, 0.0721, 0.0329}, + {0.0004, 0.0383, 0.0115}, + {0.0405, 0.0861, 0.0915}, + {-0.0236, -0.0185, -0.0259}, + {-0.0245, 0.0250, 0.1180}, + {0.1008, 0.0755, -0.0421}, + {-0.0515, 0.0201, 0.0011}, + {0.0428, -0.0012, -0.0036}, + {0.0817, 0.0765, 0.0749}, + {-0.1264, -0.0522, -0.1103}, + {-0.0280, -0.0881, -0.0499}, + {-0.1262, -0.0982, -0.0778}}; + +// https://github.com/Stability-AI/sd3.5/blob/main/sd3_impls.py#L228-L246 +const float sd3_latent_rgb_proj[16][3] = { + {-0.0645, 0.0177, 0.1052}, + {0.0028, 0.0312, 0.0650}, + {0.1848, 0.0762, 0.0360}, + {0.0944, 0.0360, 0.0889}, + {0.0897, 0.0506, -0.0364}, + {-0.0020, 0.1203, 0.0284}, + {0.0855, 0.0118, 0.0283}, + {-0.0539, 0.0658, 0.1047}, + {-0.0057, 0.0116, 0.0700}, + {-0.0412, 0.0281, -0.0039}, + {0.1106, 0.1171, 0.1220}, + {-0.0248, 0.0682, -0.0481}, + {0.0815, 0.0846, 0.1207}, + {-0.0120, -0.0055, -0.0867}, + {-0.0749, -0.0634, -0.0456}, + {-0.1418, -0.1457, -0.1259}, +}; + +// 
https://github.com/comfyanonymous/ComfyUI/blob/master/comfy/latent_formats.py#L32-L38 +const float sdxl_latent_rgb_proj[4][3] = { + {0.3651, 0.4232, 0.4341}, + {-0.2533, -0.0042, 0.1068}, + {0.1076, 0.1111, -0.0362}, + {-0.3165, -0.2492, -0.2188}}; + +// https://github.com/comfyanonymous/ComfyUI/blob/master/comfy/latent_formats.py#L32-L38 +const float sd_latent_rgb_proj[4][3]{ + {0.3512, 0.2297, 0.3227}, + {0.3250, 0.4974, 0.2350}, + {-0.2829, 0.1762, 0.2721}, + {-0.2120, -0.2616, -0.7177}}; + +void step_callback(int step, struct ggml_tensor* latents, enum SDVersion version) { + const int channel = 3; + int width = latents->ne[0]; + int height = latents->ne[1]; + int dim = latents->ne[2]; + + const float (*latent_rgb_proj)[channel]; + + if (dim == 16) { + // 16 channels VAE -> Flux or SD3 + + if (sd_version_is_sd3(version)) { + latent_rgb_proj = sd3_latent_rgb_proj; + } else if (sd_version_is_flux(version)) { + latent_rgb_proj = flux_latent_rgb_proj; + } else { + // unknown model + return; + } + + } else if (dim == 4) { + // 4 channels VAE + if (version == VERSION_SDXL) { + latent_rgb_proj = sdxl_latent_rgb_proj; + } else if (version == VERSION_SD1 || version == VERSION_SD2) { + latent_rgb_proj = sd_latent_rgb_proj; + } else { + // unknown model + return; + } + } else { + // unknown latent space + return; + } + uint8_t* data = (uint8_t*)malloc(width * height * channel * sizeof(uint8_t)); + int data_head = 0; + for (int j = 0; j < height; j++) { + for (int i = 0; i < width; i++) { + int latent_id = (i * latents->nb[0] + j * latents->nb[1]); + float r = 0, g = 0, b = 0; + for (int d = 0; d < dim; d++) { + float value = *(float*)((char*)latents->data + latent_id + d * latents->nb[2]); + r += value * latent_rgb_proj[d][0]; + g += value * latent_rgb_proj[d][1]; + b += value * latent_rgb_proj[d][2]; + } + + // change range + r = r * .5 + .5; + g = g * .5 + .5; + b = b * .5 + .5; + + // clamp rgb values to [0,1] range + r = r >= 0 ? r <= 1 ? r : 1 : 0; + g = g >= 0 ? 
g <= 1 ? g : 1 : 0; + b = b >= 0 ? b <= 1 ? b : 1 : 0; + + data[data_head++] = (uint8_t)(r * 255.); + data[data_head++] = (uint8_t)(g * 255.); + data[data_head++] = (uint8_t)(b * 255.); + } + } + stbi_write_png("latent-preview.png", width, height, channel, data, 0); + free(data); +} + int main(int argc, const char* argv[]) { SDParams params; @@ -993,7 +1115,7 @@ int main(int argc, const char* argv[]) { params.input_id_images_path.c_str(), }; - results = generate_image(sd_ctx, &img_gen_params); + results = generate_image(sd_ctx, &img_gen_params, &step_callback); expected_num_results = params.batch_count; } else if (params.mode == VID_GEN) { sd_vid_gen_params_t vid_gen_params = { diff --git a/stable-diffusion.cpp b/stable-diffusion.cpp index c5448f927..432b194d2 100644 --- a/stable-diffusion.cpp +++ b/stable-diffusion.cpp @@ -864,7 +864,8 @@ class StableDiffusionGGML { int start_merge_step, SDCondition id_cond, std::vector ref_latents = {}, - ggml_tensor* denoise_mask = nullptr) { + ggml_tensor* denoise_mask = nullptr, + std::function step_callback = nullptr) { std::vector skip_layers(guidance.slg.layers, guidance.slg.layers + guidance.slg.layer_count); float cfg_scale = guidance.txt_cfg; @@ -1096,6 +1097,9 @@ class StableDiffusionGGML { } } + if (step_callback != nullptr) { + step_callback(step, denoised, version); + } return denoised; }; @@ -1544,8 +1548,9 @@ sd_image_t* generate_image_internal(sd_ctx_t* sd_ctx, bool normalize_input, std::string input_id_images_path, std::vector ref_latents, - ggml_tensor* concat_latent = NULL, - ggml_tensor* denoise_mask = NULL) { + ggml_tensor* concat_latent = NULL, + ggml_tensor* denoise_mask = NULL, + std::function step_callback = nullptr) { if (seed < 0) { // Generally, when using the provided command line, the seed is always >0. 
// However, to prevent potential issues if 'stable-diffusion.cpp' is invoked as a library @@ -1814,7 +1819,8 @@ sd_image_t* generate_image_internal(sd_ctx_t* sd_ctx, start_merge_step, id_cond, ref_latents, - denoise_mask); + denoise_mask, + step_callback); // struct ggml_tensor* x_0 = load_tensor_from_file(ctx, "samples_ddim.bin"); // print_ggml_tensor(x_0); @@ -1888,7 +1894,7 @@ ggml_tensor* generate_init_latent(sd_ctx_t* sd_ctx, return init_latent; } -sd_image_t* generate_image(sd_ctx_t* sd_ctx, const sd_img_gen_params_t* sd_img_gen_params) { +sd_image_t* generate_image(sd_ctx_t* sd_ctx, const sd_img_gen_params_t* sd_img_gen_params,step_callback_t step_callback) { int width = sd_img_gen_params->width; int height = sd_img_gen_params->height; if (sd_version_is_dit(sd_ctx->sd->version)) { @@ -2097,7 +2103,8 @@ sd_image_t* generate_image(sd_ctx_t* sd_ctx, const sd_img_gen_params_t* sd_img_g sd_img_gen_params->input_id_images_path, ref_latents, concat_latent, - denoise_mask); + denoise_mask, + step_callback); size_t t2 = ggml_time_ms(); diff --git a/stable-diffusion.h b/stable-diffusion.h index e87ac2ce2..7cced3759 100644 --- a/stable-diffusion.h +++ b/stable-diffusion.h @@ -228,9 +228,11 @@ SD_API char* sd_ctx_params_to_str(const sd_ctx_params_t* sd_ctx_params); SD_API sd_ctx_t* new_sd_ctx(const sd_ctx_params_t* sd_ctx_params); SD_API void free_sd_ctx(sd_ctx_t* sd_ctx); +typedef void (*step_callback_t)(int, struct ggml_tensor*, enum SDVersion); + SD_API void sd_img_gen_params_init(sd_img_gen_params_t* sd_img_gen_params); SD_API char* sd_img_gen_params_to_str(const sd_img_gen_params_t* sd_img_gen_params); -SD_API sd_image_t* generate_image(sd_ctx_t* sd_ctx, const sd_img_gen_params_t* sd_img_gen_params); +SD_API sd_image_t* generate_image(sd_ctx_t* sd_ctx, const sd_img_gen_params_t* sd_img_gen_params, step_callback_t step_callback); SD_API void sd_vid_gen_params_init(sd_vid_gen_params_t* sd_vid_gen_params); SD_API sd_image_t* generate_video(sd_ctx_t* sd_ctx, const 
sd_vid_gen_params_t* sd_vid_gen_params); // broken From de9c49291f42af0d9d007cef859afa06a5e1665b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?St=C3=A9phane=20du=20Hamel?= Date: Thu, 10 Jul 2025 19:13:43 +0200 Subject: [PATCH 02/45] fix posix compile --- examples/cli/main.cpp | 2 +- stable-diffusion.h | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/examples/cli/main.cpp b/examples/cli/main.cpp index ef7877aa1..1a27f8848 100644 --- a/examples/cli/main.cpp +++ b/examples/cli/main.cpp @@ -1115,7 +1115,7 @@ int main(int argc, const char* argv[]) { params.input_id_images_path.c_str(), }; - results = generate_image(sd_ctx, &img_gen_params, &step_callback); + results = generate_image(sd_ctx, &img_gen_params, (step_callback_t)step_callback); expected_num_results = params.batch_count; } else if (params.mode == VID_GEN) { sd_vid_gen_params_t vid_gen_params = { diff --git a/stable-diffusion.h b/stable-diffusion.h index 7cced3759..8ffafc1f7 100644 --- a/stable-diffusion.h +++ b/stable-diffusion.h @@ -228,7 +228,7 @@ SD_API char* sd_ctx_params_to_str(const sd_ctx_params_t* sd_ctx_params); SD_API sd_ctx_t* new_sd_ctx(const sd_ctx_params_t* sd_ctx_params); SD_API void free_sd_ctx(sd_ctx_t* sd_ctx); -typedef void (*step_callback_t)(int, struct ggml_tensor*, enum SDVersion); +typedef void (*step_callback_t)(int, struct ggml_tensor*, int); SD_API void sd_img_gen_params_init(sd_img_gen_params_t* sd_img_gen_params); SD_API char* sd_img_gen_params_to_str(const sd_img_gen_params_t* sd_img_gen_params); From ee4aef89f1b10b0ad8b1d030357cc0df3beb74b3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?St=C3=A9phane=20du=20Hamel?= Date: Thu, 10 Jul 2025 19:13:44 +0200 Subject: [PATCH 03/45] move latent preview code to a separate file --- examples/cli/main.cpp | 83 +++---------------------------------------- latent-preview.h | 83 +++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 87 insertions(+), 79 deletions(-) create mode 100644 latent-preview.h diff --git 
a/examples/cli/main.cpp b/examples/cli/main.cpp index 1a27f8848..9e1084e3e 100644 --- a/examples/cli/main.cpp +++ b/examples/cli/main.cpp @@ -15,6 +15,8 @@ // #include "preprocessing.hpp" #include "stable-diffusion.h" +#include "latent-preview.h" + #define STB_IMAGE_IMPLEMENTATION #define STB_IMAGE_STATIC #include "stb_image.h" @@ -755,59 +757,6 @@ void sd_log_cb(enum sd_log_level_t level, const char* log, void* data) { fflush(out_stream); } -// https://github.com/comfyanonymous/ComfyUI/blob/master/comfy/latent_formats.py#L152-L169 -const float flux_latent_rgb_proj[16][3] = { - {-0.0346, 0.0244, 0.0681}, - {0.0034, 0.0210, 0.0687}, - {0.0275, -0.0668, -0.0433}, - {-0.0174, 0.0160, 0.0617}, - {0.0859, 0.0721, 0.0329}, - {0.0004, 0.0383, 0.0115}, - {0.0405, 0.0861, 0.0915}, - {-0.0236, -0.0185, -0.0259}, - {-0.0245, 0.0250, 0.1180}, - {0.1008, 0.0755, -0.0421}, - {-0.0515, 0.0201, 0.0011}, - {0.0428, -0.0012, -0.0036}, - {0.0817, 0.0765, 0.0749}, - {-0.1264, -0.0522, -0.1103}, - {-0.0280, -0.0881, -0.0499}, - {-0.1262, -0.0982, -0.0778}}; - -// https://github.com/Stability-AI/sd3.5/blob/main/sd3_impls.py#L228-L246 -const float sd3_latent_rgb_proj[16][3] = { - {-0.0645, 0.0177, 0.1052}, - {0.0028, 0.0312, 0.0650}, - {0.1848, 0.0762, 0.0360}, - {0.0944, 0.0360, 0.0889}, - {0.0897, 0.0506, -0.0364}, - {-0.0020, 0.1203, 0.0284}, - {0.0855, 0.0118, 0.0283}, - {-0.0539, 0.0658, 0.1047}, - {-0.0057, 0.0116, 0.0700}, - {-0.0412, 0.0281, -0.0039}, - {0.1106, 0.1171, 0.1220}, - {-0.0248, 0.0682, -0.0481}, - {0.0815, 0.0846, 0.1207}, - {-0.0120, -0.0055, -0.0867}, - {-0.0749, -0.0634, -0.0456}, - {-0.1418, -0.1457, -0.1259}, -}; - -// https://github.com/comfyanonymous/ComfyUI/blob/master/comfy/latent_formats.py#L32-L38 -const float sdxl_latent_rgb_proj[4][3] = { - {0.3651, 0.4232, 0.4341}, - {-0.2533, -0.0042, 0.1068}, - {0.1076, 0.1111, -0.0362}, - {-0.3165, -0.2492, -0.2188}}; - -// https://github.com/comfyanonymous/ComfyUI/blob/master/comfy/latent_formats.py#L32-L38 -const 
float sd_latent_rgb_proj[4][3]{ - {0.3512, 0.2297, 0.3227}, - {0.3250, 0.4974, 0.2350}, - {-0.2829, 0.1762, 0.2721}, - {-0.2120, -0.2616, -0.7177}}; - void step_callback(int step, struct ggml_tensor* latents, enum SDVersion version) { const int channel = 3; int width = latents->ne[0]; @@ -843,33 +792,9 @@ void step_callback(int step, struct ggml_tensor* latents, enum SDVersion version return; } uint8_t* data = (uint8_t*)malloc(width * height * channel * sizeof(uint8_t)); - int data_head = 0; - for (int j = 0; j < height; j++) { - for (int i = 0; i < width; i++) { - int latent_id = (i * latents->nb[0] + j * latents->nb[1]); - float r = 0, g = 0, b = 0; - for (int d = 0; d < dim; d++) { - float value = *(float*)((char*)latents->data + latent_id + d * latents->nb[2]); - r += value * latent_rgb_proj[d][0]; - g += value * latent_rgb_proj[d][1]; - b += value * latent_rgb_proj[d][2]; - } - - // change range - r = r * .5 + .5; - g = g * .5 + .5; - b = b * .5 + .5; - - // clamp rgb values to [0,1] range - r = r >= 0 ? r <= 1 ? r : 1 : 0; - g = g >= 0 ? g <= 1 ? g : 1 : 0; - b = b >= 0 ? b <= 1 ? 
b : 1 : 0; + + preview_latent_image(data, latents, latent_rgb_proj, width, height, dim); - data[data_head++] = (uint8_t)(r * 255.); - data[data_head++] = (uint8_t)(g * 255.); - data[data_head++] = (uint8_t)(b * 255.); - } - } stbi_write_png("latent-preview.png", width, height, channel, data, 0); free(data); } diff --git a/latent-preview.h b/latent-preview.h new file mode 100644 index 000000000..5457c47ed --- /dev/null +++ b/latent-preview.h @@ -0,0 +1,83 @@ + +// https://github.com/comfyanonymous/ComfyUI/blob/master/comfy/latent_formats.py#L152-L169 +const float flux_latent_rgb_proj[16][3] = { + {-0.0346, 0.0244, 0.0681}, + {0.0034, 0.0210, 0.0687}, + {0.0275, -0.0668, -0.0433}, + {-0.0174, 0.0160, 0.0617}, + {0.0859, 0.0721, 0.0329}, + {0.0004, 0.0383, 0.0115}, + {0.0405, 0.0861, 0.0915}, + {-0.0236, -0.0185, -0.0259}, + {-0.0245, 0.0250, 0.1180}, + {0.1008, 0.0755, -0.0421}, + {-0.0515, 0.0201, 0.0011}, + {0.0428, -0.0012, -0.0036}, + {0.0817, 0.0765, 0.0749}, + {-0.1264, -0.0522, -0.1103}, + {-0.0280, -0.0881, -0.0499}, + {-0.1262, -0.0982, -0.0778}}; + +// https://github.com/Stability-AI/sd3.5/blob/main/sd3_impls.py#L228-L246 +const float sd3_latent_rgb_proj[16][3] = { + {-0.0645, 0.0177, 0.1052}, + {0.0028, 0.0312, 0.0650}, + {0.1848, 0.0762, 0.0360}, + {0.0944, 0.0360, 0.0889}, + {0.0897, 0.0506, -0.0364}, + {-0.0020, 0.1203, 0.0284}, + {0.0855, 0.0118, 0.0283}, + {-0.0539, 0.0658, 0.1047}, + {-0.0057, 0.0116, 0.0700}, + {-0.0412, 0.0281, -0.0039}, + {0.1106, 0.1171, 0.1220}, + {-0.0248, 0.0682, -0.0481}, + {0.0815, 0.0846, 0.1207}, + {-0.0120, -0.0055, -0.0867}, + {-0.0749, -0.0634, -0.0456}, + {-0.1418, -0.1457, -0.1259}, +}; + +// https://github.com/comfyanonymous/ComfyUI/blob/master/comfy/latent_formats.py#L32-L38 +const float sdxl_latent_rgb_proj[4][3] = { + {0.3651, 0.4232, 0.4341}, + {-0.2533, -0.0042, 0.1068}, + {0.1076, 0.1111, -0.0362}, + {-0.3165, -0.2492, -0.2188}}; + +// 
https://github.com/comfyanonymous/ComfyUI/blob/master/comfy/latent_formats.py#L32-L38 +const float sd_latent_rgb_proj[4][3]{ + {0.3512, 0.2297, 0.3227}, + {0.3250, 0.4974, 0.2350}, + {-0.2829, 0.1762, 0.2721}, + {-0.2120, -0.2616, -0.7177}}; + +void preview_latent_image(uint8_t* buffer, struct ggml_tensor* latents, const float (*latent_rgb_proj)[3], int width, int height, int dim) { + size_t buffer_head = 0; + for (int j = 0; j < height; j++) { + for (int i = 0; i < width; i++) { + size_t latent_id = (i * latents->nb[0] + j * latents->nb[1]); + float r = 0, g = 0, b = 0; + for (int d = 0; d < dim; d++) { + float value = *(float*)((char*)latents->data + latent_id + d * latents->nb[2]); + r += value * latent_rgb_proj[d][0]; + g += value * latent_rgb_proj[d][1]; + b += value * latent_rgb_proj[d][2]; + } + + // change range + r = r * .5f + .5f; + g = g * .5f + .5f; + b = b * .5f + .5f; + + // clamp rgb values to [0,1] range + r = r >= 0 ? r <= 1 ? r : 1 : 0; + g = g >= 0 ? g <= 1 ? g : 1 : 0; + b = b >= 0 ? b <= 1 ? 
b : 1 : 0; + + buffer[buffer_head++] = (uint8_t)(r * 255); + buffer[buffer_head++] = (uint8_t)(g * 255); + buffer[buffer_head++] = (uint8_t)(b * 255); + } + } +} \ No newline at end of file From 75a9abdf709aeff84dc92e33f7df4e979fedf055 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?St=C3=A9phane=20du=20Hamel?= Date: Thu, 10 Jul 2025 19:13:46 +0200 Subject: [PATCH 04/45] Latent preview support for img2img and img2vid --- examples/cli/main.cpp | 2 +- stable-diffusion.cpp | 9 ++++++--- stable-diffusion.h | 2 +- 3 files changed, 8 insertions(+), 5 deletions(-) diff --git a/examples/cli/main.cpp b/examples/cli/main.cpp index 9e1084e3e..168b1eb66 100644 --- a/examples/cli/main.cpp +++ b/examples/cli/main.cpp @@ -1058,7 +1058,7 @@ int main(int argc, const char* argv[]) { params.augmentation_level, }; - results = generate_video(sd_ctx, &vid_gen_params); + results = generate_video(sd_ctx, &vid_gen_params, (step_callback_t)step_callback); expected_num_results = params.video_frames; } diff --git a/stable-diffusion.cpp b/stable-diffusion.cpp index 432b194d2..645387b83 100644 --- a/stable-diffusion.cpp +++ b/stable-diffusion.cpp @@ -1894,7 +1894,7 @@ ggml_tensor* generate_init_latent(sd_ctx_t* sd_ctx, return init_latent; } -sd_image_t* generate_image(sd_ctx_t* sd_ctx, const sd_img_gen_params_t* sd_img_gen_params,step_callback_t step_callback) { +sd_image_t* generate_image(sd_ctx_t* sd_ctx, const sd_img_gen_params_t* sd_img_gen_params, step_callback_t step_callback) { int width = sd_img_gen_params->width; int height = sd_img_gen_params->height; if (sd_version_is_dit(sd_ctx->sd->version)) { @@ -2113,7 +2113,7 @@ sd_image_t* generate_image(sd_ctx_t* sd_ctx, const sd_img_gen_params_t* sd_img_g return result_images; } -SD_API sd_image_t* generate_video(sd_ctx_t* sd_ctx, const sd_vid_gen_params_t* sd_vid_gen_params) { +SD_API sd_image_t* generate_video(sd_ctx_t* sd_ctx, const sd_vid_gen_params_t* sd_vid_gen_params, step_callback_t step_callback) { if (sd_ctx == NULL || sd_vid_gen_params == 
NULL) { return NULL; } @@ -2195,7 +2195,10 @@ SD_API sd_image_t* generate_video(sd_ctx_t* sd_ctx, const sd_vid_gen_params_t* s sd_vid_gen_params->sample_method, sigmas, -1, - SDCondition(NULL, NULL, NULL)); + SDCondition(NULL, NULL, NULL), + {}, + NULL, + step_callback); int64_t t2 = ggml_time_ms(); LOG_INFO("sampling completed, taking %.2fs", (t2 - t1) * 1.0f / 1000); diff --git a/stable-diffusion.h b/stable-diffusion.h index 8ffafc1f7..16b8ac3c4 100644 --- a/stable-diffusion.h +++ b/stable-diffusion.h @@ -235,7 +235,7 @@ SD_API char* sd_img_gen_params_to_str(const sd_img_gen_params_t* sd_img_gen_para SD_API sd_image_t* generate_image(sd_ctx_t* sd_ctx, const sd_img_gen_params_t* sd_img_gen_params, step_callback_t step_callback); SD_API void sd_vid_gen_params_init(sd_vid_gen_params_t* sd_vid_gen_params); -SD_API sd_image_t* generate_video(sd_ctx_t* sd_ctx, const sd_vid_gen_params_t* sd_vid_gen_params); // broken +SD_API sd_image_t* generate_video(sd_ctx_t* sd_ctx, const sd_vid_gen_params_t* sd_vid_gen_params, step_callback_t step_callback); // broken typedef struct upscaler_ctx_t upscaler_ctx_t; From 8dcb814059f932eb460acdc954955dc168988510 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?St=C3=A9phane=20du=20Hamel?= Date: Thu, 10 Jul 2025 19:13:48 +0200 Subject: [PATCH 05/45] add latent-preview to .gitignore --- .gitignore | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/.gitignore b/.gitignore index 38fe570df..2e520df2c 100644 --- a/.gitignore +++ b/.gitignore @@ -10,4 +10,5 @@ test/ *.gguf output*.png models* -*.log \ No newline at end of file +*.log +latent-preview.png From ef6207882ce5c461f8ecc8df1102f4db842ef792 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?St=C3=A9phane=20du=20Hamel?= Date: Thu, 10 Jul 2025 19:13:49 +0200 Subject: [PATCH 06/45] Refactor latent preview + support tae/vae preview --- .gitignore | 2 +- examples/cli/main.cpp | 87 +++++++++++++-------------- stable-diffusion.cpp | 137 +++++++++++++++++++++++++++++++++++++++--- 
stable-diffusion.h | 13 +++- 4 files changed, 182 insertions(+), 57 deletions(-) diff --git a/.gitignore b/.gitignore index 2e520df2c..552d5673c 100644 --- a/.gitignore +++ b/.gitignore @@ -11,4 +11,4 @@ test/ output*.png models* *.log -latent-preview.png +preview.png diff --git a/examples/cli/main.cpp b/examples/cli/main.cpp index 168b1eb66..fea609424 100644 --- a/examples/cli/main.cpp +++ b/examples/cli/main.cpp @@ -9,14 +9,9 @@ #include #include -#include "model.h" - - // #include "preprocessing.hpp" #include "stable-diffusion.h" -#include "latent-preview.h" - #define STB_IMAGE_IMPLEMENTATION #define STB_IMAGE_STATIC #include "stb_image.h" @@ -39,6 +34,13 @@ const char* modes_str[] = { }; #define SD_ALL_MODES_STR "img_gen, vid_gen, convert" +const char* previews_str[] = { + "none", + "proj", + "tae", + "vae", +}; + enum SDMode { IMG_GEN, VID_GEN, @@ -116,6 +118,11 @@ struct SDParams { bool chroma_use_dit_mask = true; bool chroma_use_t5_mask = false; int chroma_t5_mask_pad = 1; + + sd_preview_policy_t preview_method = SD_PREVIEW_NONE; + int preview_interval = 1; + std::string preview_path = "preview.png"; + bool taesd_preview = false; }; void print_params(SDParams params) { @@ -399,6 +406,7 @@ void parse_args(int argc, const char** argv, SDParams& params) { {"-o", "--output", "", ¶ms.output_path}, {"-p", "--prompt", "", ¶ms.prompt}, {"-n", "--negative-prompt", "", ¶ms.negative_prompt}, + {"", "--preview-path", "", ¶ms.preview_path}, {"", "--upscale-model", "", ¶ms.esrgan_path}, }; @@ -412,6 +420,7 @@ void parse_args(int argc, const char** argv, SDParams& params) { {"", "--clip-skip", "", ¶ms.clip_skip}, {"-b", "--batch-count", "", ¶ms.batch_count}, {"", "--chroma-t5-mask-pad", "", ¶ms.chroma_t5_mask_pad}, + {"", "--preview-interval", "", ¶ms.preview_interval}, }; options.float_options = { @@ -442,6 +451,7 @@ void parse_args(int argc, const char** argv, SDParams& params) { {"", "--color", "", true, ¶ms.color}, {"", "--chroma-disable-dit-mask", "", false, 
¶ms.chroma_use_dit_mask}, {"", "--chroma-enable-t5-mask", "", true, ¶ms.chroma_use_t5_mask}, + {"", "--taesd-preview-only", "", false, ¶ms.taesd_preview}, }; auto on_mode_arg = [&](int argc, const char** argv, int index) { @@ -572,6 +582,26 @@ void parse_args(int argc, const char** argv, SDParams& params) { return 1; }; + auto on_preview_arg = [&](int argc, const char** argv, int index) { + if (++index >= argc) { + return -1; + } + const char* preview = argv[index]; + int preview_method = -1; + for (int m = 0; m < N_PREVIEWS; m++) { + if (!strcmp(preview, previews_str[m])) { + preview_method = m; + } + } + if (preview_method == -1) { + fprintf(stderr, "error: preview method %s\n", + preview); + return -1; + } + params.preview_method = (sd_preview_policy_t)preview_method; + return 1; + }; + options.manual_options = { {"-M", "--mode", "", on_mode_arg}, {"", "--type", "", on_type_arg}, @@ -582,6 +612,7 @@ void parse_args(int argc, const char** argv, SDParams& params) { {"", "--skip-layers", "", on_skip_layers_arg}, {"-r", "--ref-image", "", on_ref_image_arg}, {"-h", "--help", "", on_help_arg}, + {"", "--preview", "", on_preview_arg}, }; if (!parse_options(argc, argv, options)) { @@ -757,52 +788,17 @@ void sd_log_cb(enum sd_log_level_t level, const char* log, void* data) { fflush(out_stream); } -void step_callback(int step, struct ggml_tensor* latents, enum SDVersion version) { - const int channel = 3; - int width = latents->ne[0]; - int height = latents->ne[1]; - int dim = latents->ne[2]; - - const float (*latent_rgb_proj)[channel]; - - if (dim == 16) { - // 16 channels VAE -> Flux or SD3 - - if (sd_version_is_sd3(version)) { - latent_rgb_proj = sd3_latent_rgb_proj; - } else if (sd_version_is_flux(version)) { - latent_rgb_proj = flux_latent_rgb_proj; - } else { - // unknown model - return; - } - - } else if (dim == 4) { - // 4 channels VAE - if (version == VERSION_SDXL) { - latent_rgb_proj = sdxl_latent_rgb_proj; - } else if (version == VERSION_SD1 || version == 
VERSION_SD2) { - latent_rgb_proj = sd_latent_rgb_proj; - } else { - // unknown model - return; - } - } else { - // unknown latent space - return; - } - uint8_t* data = (uint8_t*)malloc(width * height * channel * sizeof(uint8_t)); - - preview_latent_image(data, latents, latent_rgb_proj, width, height, dim); +const char* preview_path; - stbi_write_png("latent-preview.png", width, height, channel, data, 0); - free(data); +void step_callback(int step, sd_image_t image) { + stbi_write_png(preview_path, image.width, image.height, image.channel, image.data, 0); } int main(int argc, const char* argv[]) { SDParams params; parse_args(argc, argv, params); + preview_path = params.preview_path.c_str(); sd_guidance_params_t guidance_params = {params.cfg_scale, params.img_cfg_scale, @@ -958,6 +954,7 @@ int main(int argc, const char* argv[]) { params.control_net_cpu, params.vae_on_cpu, params.diffusion_flash_attn, + params.taesd_preview, params.diffusion_conv_direct, params.vae_conv_direct, params.chroma_use_dit_mask, @@ -1040,7 +1037,7 @@ int main(int argc, const char* argv[]) { params.input_id_images_path.c_str(), }; - results = generate_image(sd_ctx, &img_gen_params, (step_callback_t)step_callback); + results = generate_image(sd_ctx, &img_gen_params, params.preview_method, params.preview_interval,(step_callback_t)step_callback); expected_num_results = params.batch_count; } else if (params.mode == VID_GEN) { sd_vid_gen_params_t vid_gen_params = { diff --git a/stable-diffusion.cpp b/stable-diffusion.cpp index 645387b83..ba6eeedf4 100644 --- a/stable-diffusion.cpp +++ b/stable-diffusion.cpp @@ -20,6 +20,8 @@ #define STB_IMAGE_STATIC #include "stb_image.h" +#include "latent-preview.h" + // #define STB_IMAGE_WRITE_IMPLEMENTATION // #define STB_IMAGE_WRITE_STATIC // #include "stb_image_write.h" @@ -386,7 +388,7 @@ class StableDiffusionGGML { diffusion_model->alloc_params_buffer(); diffusion_model->get_param_tensors(tensors); - if (!use_tiny_autoencoder) { + if (!use_tiny_autoencoder 
|| sd_ctx_params->tae_preview_only) { if (sd_ctx_params->keep_vae_on_cpu && !ggml_backend_is_cpu(backend)) { LOG_INFO("VAE Autoencoder: Using CPU backend"); vae_backend = ggml_backend_cpu_init(); @@ -405,7 +407,8 @@ class StableDiffusionGGML { } first_stage_model->alloc_params_buffer(); first_stage_model->get_param_tensors(tensors, "first_stage_model"); - } else { + } + if (use_tiny_autoencoder) { tae_first_stage = std::make_shared(backend, model_loader.tensor_storages_types, "decoder.layers", @@ -509,9 +512,10 @@ class StableDiffusionGGML { size_t clip_params_mem_size = cond_stage_model->get_params_buffer_size(); size_t unet_params_mem_size = diffusion_model->get_params_buffer_size(); size_t vae_params_mem_size = 0; - if (!use_tiny_autoencoder) { + if (!use_tiny_autoencoder || sd_ctx_params->tae_preview_only) { vae_params_mem_size = first_stage_model->get_params_buffer_size(); - } else { + } + if (use_tiny_autoencoder) { if (!tae_first_stage->load_from_file(taesd_path)) { return false; } @@ -663,6 +667,7 @@ class StableDiffusionGGML { LOG_DEBUG("finished loaded file"); ggml_free(ctx); + use_tiny_autoencoder = use_tiny_autoencoder && !sd_ctx_params->tae_preview_only; return true; } @@ -848,6 +853,100 @@ class StableDiffusionGGML { LOG_DEBUG("computing svd condition graph completed, taking %" PRId64 " ms", t1 - t0); return {c_crossattn, y, c_concat}; } +void preview_image(ggml_context* work_ctx, + int step, + struct ggml_tensor* latents, + enum SDVersion version, + sd_preview_policy_t preview_mode, + ggml_tensor* result, + std::function step_callback) { + const size_t channel = 3; + size_t width = latents->ne[0]; + size_t height = latents->ne[1]; + size_t dim = latents->ne[2]; + if (preview_mode == SD_PREVIEW_PROJ) { + const float (*latent_rgb_proj)[channel]; + + if (dim == 16) { + // 16 channels VAE -> Flux or SD3 + + if (sd_version_is_sd3(version)) { + latent_rgb_proj = sd3_latent_rgb_proj; + } else if (sd_version_is_flux(version)) { + latent_rgb_proj = 
flux_latent_rgb_proj; + } else { + // unknown model + return; + } + + } else if (dim == 4) { + // 4 channels VAE + if (version == VERSION_SDXL) { + latent_rgb_proj = sdxl_latent_rgb_proj; + } else if (version == VERSION_SD1 || version == VERSION_SD2) { + latent_rgb_proj = sd_latent_rgb_proj; + } else { + // unknown model + return; + } + } else { + // unknown latent space + return; + } + uint8_t* data = (uint8_t*)malloc(width * height * channel * sizeof(uint8_t)); + + preview_latent_image(data, latents, latent_rgb_proj, width, height, dim); + sd_image_t image = { + width, + height, + channel, + data}; + step_callback(step, image); + free(image.data); + } else { + if (preview_mode == SD_PREVIEW_VAE) { + ggml_tensor_scale(latents, 1.0f / scale_factor); + if (vae_tiling) { + // split latent in 32x32 tiles and compute in several steps + auto on_tiling = [&](ggml_tensor* in, ggml_tensor* out, bool init) { + first_stage_model->compute(n_threads, in, true, &out); + }; + sd_tiling(latents, result, 8, 32, 0.5f, on_tiling); + } else { + first_stage_model->compute(n_threads, latents, true, &result); + } + first_stage_model->free_compute_buffer(); + + ggml_tensor_scale_output(result); + } else if (preview_mode == SD_PREVIEW_TAE) { + if (tae_first_stage == nullptr) { + LOG_WARN("TAE not found for preview"); + return; + } + if (vae_tiling) { + // split latent in 64x64 tiles and compute in several steps + auto on_tiling = [&](ggml_tensor* in, ggml_tensor* out, bool init) { + tae_first_stage->compute(n_threads, in, true, &out); + }; + sd_tiling(latents, result, 8, 64, 0.5f, on_tiling); + } else { + tae_first_stage->compute(n_threads, latents, true, &result); + } + tae_first_stage->free_compute_buffer(); + } else { + return; + } + ggml_tensor_clamp(result, 0.0f, 1.0f); + sd_image_t image = { + width * 8, + height * 8, + channel, + sd_tensor_to_image(result)}; + ggml_tensor_scale(result, 0); + step_callback(step, image); + free(image.data); + } + } ggml_tensor* sample(ggml_context* 
work_ctx, ggml_tensor* init_latent, @@ -865,7 +964,9 @@ class StableDiffusionGGML { SDCondition id_cond, std::vector ref_latents = {}, ggml_tensor* denoise_mask = nullptr, - std::function step_callback = nullptr) { + sd_preview_policy_t preview_mode = SD_PREVIEW_PROJ, + int preview_interval = 1, + std::function step_callback = nullptr) { std::vector skip_layers(guidance.slg.layers, guidance.slg.layers + guidance.slg.layer_count); float cfg_scale = guidance.txt_cfg; @@ -926,6 +1027,15 @@ class StableDiffusionGGML { } struct ggml_tensor* denoised = ggml_dup_tensor(work_ctx, x); + struct ggml_tensor* preview_tensor = NULL; + if (preview_mode != SD_PREVIEW_PROJ) { + preview_tensor = ggml_new_tensor_4d(work_ctx, GGML_TYPE_F32, + (denoised->ne[0] * 8), + (denoised->ne[1] * 8), + 3, + denoised->ne[3]); + } + auto denoise = [&](ggml_tensor* input, float sigma, int step) -> ggml_tensor* { if (step == 1) { pretty_progress(0, (int)steps, 0); @@ -1098,7 +1208,9 @@ class StableDiffusionGGML { } if (step_callback != nullptr) { - step_callback(step, denoised, version); + if (step % preview_interval == 0) { + preview_image(work_ctx, step, denoised, version, preview_mode, preview_tensor, step_callback); + } } return denoised; }; @@ -1550,7 +1662,9 @@ sd_image_t* generate_image_internal(sd_ctx_t* sd_ctx, std::vector ref_latents, ggml_tensor* concat_latent = NULL, ggml_tensor* denoise_mask = NULL, - std::function step_callback = nullptr) { + sd_preview_policy_t preview_mode = SD_PREVIEW_PROJ, + int preview_interval = 1, + std::function step_callback = nullptr) { if (seed < 0) { // Generally, when using the provided command line, the seed is always >0. 
// However, to prevent potential issues if 'stable-diffusion.cpp' is invoked as a library @@ -1820,6 +1934,8 @@ sd_image_t* generate_image_internal(sd_ctx_t* sd_ctx, id_cond, ref_latents, denoise_mask, + preview_mode, + preview_interval, step_callback); // struct ggml_tensor* x_0 = load_tensor_from_file(ctx, "samples_ddim.bin"); @@ -1894,7 +2010,7 @@ ggml_tensor* generate_init_latent(sd_ctx_t* sd_ctx, return init_latent; } -sd_image_t* generate_image(sd_ctx_t* sd_ctx, const sd_img_gen_params_t* sd_img_gen_params, step_callback_t step_callback) { +sd_image_t* generate_image(sd_ctx_t* sd_ctx, const sd_img_gen_params_t* sd_img_gen_params, sd_preview_policy_t preview_mode, int preview_interval, step_callback_t step_callback) { int width = sd_img_gen_params->width; int height = sd_img_gen_params->height; if (sd_version_is_dit(sd_ctx->sd->version)) { @@ -2104,6 +2220,8 @@ sd_image_t* generate_image(sd_ctx_t* sd_ctx, const sd_img_gen_params_t* sd_img_g ref_latents, concat_latent, denoise_mask, + preview_mode, + preview_interval, step_callback); size_t t2 = ggml_time_ms(); @@ -2198,7 +2316,8 @@ SD_API sd_image_t* generate_video(sd_ctx_t* sd_ctx, const sd_vid_gen_params_t* s SDCondition(NULL, NULL, NULL), {}, NULL, - step_callback); + (sd_preview_policy_t)0, 1, + NULL); int64_t t2 = ggml_time_ms(); LOG_INFO("sampling completed, taking %.2fs", (t2 - t1) * 1.0f / 1000); diff --git a/stable-diffusion.h b/stable-diffusion.h index 16b8ac3c4..d7522a46e 100644 --- a/stable-diffusion.h +++ b/stable-diffusion.h @@ -111,6 +111,14 @@ enum sd_log_level_t { SD_LOG_ERROR }; +enum sd_preview_policy_t { + SD_PREVIEW_NONE, + SD_PREVIEW_PROJ, + SD_PREVIEW_TAE, + SD_PREVIEW_VAE, + N_PREVIEWS +}; + typedef struct { const char* model_path; const char* clip_l_path; @@ -134,6 +142,7 @@ typedef struct { bool keep_control_net_on_cpu; bool keep_vae_on_cpu; bool diffusion_flash_attn; + bool tae_preview_only; bool diffusion_conv_direct; bool vae_conv_direct; bool chroma_use_dit_mask; @@ -228,11 
+237,11 @@ SD_API char* sd_ctx_params_to_str(const sd_ctx_params_t* sd_ctx_params); SD_API sd_ctx_t* new_sd_ctx(const sd_ctx_params_t* sd_ctx_params); SD_API void free_sd_ctx(sd_ctx_t* sd_ctx); -typedef void (*step_callback_t)(int, struct ggml_tensor*, int); +typedef void (*step_callback_t)(int, sd_image_t); SD_API void sd_img_gen_params_init(sd_img_gen_params_t* sd_img_gen_params); SD_API char* sd_img_gen_params_to_str(const sd_img_gen_params_t* sd_img_gen_params); -SD_API sd_image_t* generate_image(sd_ctx_t* sd_ctx, const sd_img_gen_params_t* sd_img_gen_params, step_callback_t step_callback); +SD_API sd_image_t* generate_image(sd_ctx_t* sd_ctx, const sd_img_gen_params_t* sd_img_gen_params, sd_preview_policy_t preview_mode, int preview_interval, step_callback_t step_callback); SD_API void sd_vid_gen_params_init(sd_vid_gen_params_t* sd_vid_gen_params); SD_API sd_image_t* generate_video(sd_ctx_t* sd_ctx, const sd_vid_gen_params_t* sd_vid_gen_params, step_callback_t step_callback); // broken From 2cedeb569e639fe282835df5e0efb83e11ee69ed Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?St=C3=A9phane=20du=20Hamel?= Date: Thu, 10 Jul 2025 19:13:50 +0200 Subject: [PATCH 07/45] update usage --- examples/cli/main.cpp | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/examples/cli/main.cpp b/examples/cli/main.cpp index fea609424..3e7242926 100644 --- a/examples/cli/main.cpp +++ b/examples/cli/main.cpp @@ -182,6 +182,8 @@ void print_params(SDParams params) { printf(" chroma_use_dit_mask: %s\n", params.chroma_use_dit_mask ? "true" : "false"); printf(" chroma_use_t5_mask: %s\n", params.chroma_use_t5_mask ? 
"true" : "false"); printf(" chroma_t5_mask_pad: %d\n", params.chroma_t5_mask_pad); + printf(" preview_mode: %d\n", previews_str[params.preview_method]); + printf(" preview_interval: %d\n", params.preview_interval); } void print_usage(int argc, const char* argv[]) { @@ -198,7 +200,8 @@ void print_usage(int argc, const char* argv[]) { printf(" --clip_g path to the clip-g text encoder\n"); printf(" --t5xxl path to the t5xxl text encoder\n"); printf(" --vae [VAE] path to vae\n"); - printf(" --taesd [TAESD_PATH] path to taesd. Using Tiny AutoEncoder for fast decoding (low quality)\n"); + printf(" --taesd [TAESD] path to taesd. Using Tiny AutoEncoder for fast decoding (low quality)\n"); + printf(" --taesd-preview-only prevents usage of taesd for decoding the final image. (for use with --preview %s)\n", previews_str[SD_PREVIEW_TAE]); printf(" --control-net [CONTROL_PATH] path to control net model\n"); printf(" --embd-dir [EMBEDDING_PATH] path to embeddings\n"); printf(" --stacked-id-embd-dir [DIR] path to PHOTOMAKER stacked id embeddings\n"); @@ -254,6 +257,10 @@ void print_usage(int argc, const char* argv[]) { printf(" This might crash if it is not supported by the backend.\n"); printf(" --control-net-cpu keep controlnet in cpu (for low vram)\n"); printf(" --canny apply canny preprocessor (edge detection)\n"); + printf(" --preview {%s,%s,%s,%s} preview method. 
(default is %s(disabled))\n", previews_str[0], previews_str[1], previews_str[2], previews_str[3], previews_str[SD_PREVIEW_NONE]); + printf(" %s is the fastest\n", previews_str[SD_PREVIEW_PROJ]); + printf(" --preview-interval [N] How often to save the image preview"); + printf(" --preview-path [PATH} path to write preview image to (default: ./preview.png)\n"); printf(" --color colors the logging tags according to level\n"); printf(" --chroma-disable-dit-mask disable dit mask for chroma\n"); printf(" --chroma-enable-t5-mask enable t5 mask for chroma\n"); From be0a442cefb201d344a6e7fd6c2e58ab35fdfd67 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?St=C3=A9phane=20du=20Hamel?= Date: Thu, 10 Jul 2025 19:13:51 +0200 Subject: [PATCH 08/45] Fix build + add warning --- stable-diffusion.cpp | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/stable-diffusion.cpp b/stable-diffusion.cpp index ba6eeedf4..d0b776687 100644 --- a/stable-diffusion.cpp +++ b/stable-diffusion.cpp @@ -860,10 +860,10 @@ void preview_image(ggml_context* work_ctx, sd_preview_policy_t preview_mode, ggml_tensor* result, std::function step_callback) { - const size_t channel = 3; - size_t width = latents->ne[0]; - size_t height = latents->ne[1]; - size_t dim = latents->ne[2]; + const uint32_t channel = 3; + uint32_t width = latents->ne[0]; + uint32_t height = latents->ne[1]; + uint32_t dim = latents->ne[2]; if (preview_mode == SD_PREVIEW_PROJ) { const float (*latent_rgb_proj)[channel]; @@ -875,6 +875,7 @@ void preview_image(ggml_context* work_ctx, } else if (sd_version_is_flux(version)) { latent_rgb_proj = flux_latent_rgb_proj; } else { + LOG_WARN("No latent to RGB projection known for this model"); // unknown model return; } @@ -887,9 +888,11 @@ void preview_image(ggml_context* work_ctx, latent_rgb_proj = sd_latent_rgb_proj; } else { // unknown model + LOG_WARN("No latent to RGB projection known for this model"); return; } } else { + LOG_WARN("No latent to RGB projection known for this 
model"); // unknown latent space return; } From 31b0fdd01aa008d1801535446ef7c20842777ec8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?St=C3=A9phane=20du=20Hamel?= Date: Thu, 10 Jul 2025 19:13:53 +0200 Subject: [PATCH 09/45] Disable preview by default in sdcpp too --- stable-diffusion.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/stable-diffusion.cpp b/stable-diffusion.cpp index d0b776687..e197ee84d 100644 --- a/stable-diffusion.cpp +++ b/stable-diffusion.cpp @@ -967,7 +967,7 @@ void preview_image(ggml_context* work_ctx, SDCondition id_cond, std::vector ref_latents = {}, ggml_tensor* denoise_mask = nullptr, - sd_preview_policy_t preview_mode = SD_PREVIEW_PROJ, + sd_preview_policy_t preview_mode = SD_PREVIEW_NONE, int preview_interval = 1, std::function step_callback = nullptr) { std::vector skip_layers(guidance.slg.layers, guidance.slg.layers + guidance.slg.layer_count); @@ -1665,7 +1665,7 @@ sd_image_t* generate_image_internal(sd_ctx_t* sd_ctx, std::vector ref_latents, ggml_tensor* concat_latent = NULL, ggml_tensor* denoise_mask = NULL, - sd_preview_policy_t preview_mode = SD_PREVIEW_PROJ, + sd_preview_policy_t preview_mode = SD_PREVIEW_NONE, int preview_interval = 1, std::function step_callback = nullptr) { if (seed < 0) { From 95fd31ccb7e639bbd0130bc32e7ff20cc8515e6f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?St=C3=A9phane=20du=20Hamel?= Date: Thu, 10 Jul 2025 19:13:54 +0200 Subject: [PATCH 10/45] Do not preload preview tensor when preview is disabled. 
--- stable-diffusion.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/stable-diffusion.cpp b/stable-diffusion.cpp index e197ee84d..b8c841cd2 100644 --- a/stable-diffusion.cpp +++ b/stable-diffusion.cpp @@ -1031,7 +1031,7 @@ void preview_image(ggml_context* work_ctx, struct ggml_tensor* denoised = ggml_dup_tensor(work_ctx, x); struct ggml_tensor* preview_tensor = NULL; - if (preview_mode != SD_PREVIEW_PROJ) { + if (preview_mode != SD_PREVIEW_NONE && preview_mode != SD_PREVIEW_PROJ) { preview_tensor = ggml_new_tensor_4d(work_ctx, GGML_TYPE_F32, (denoised->ne[0] * 8), (denoised->ne[1] * 8), From cbd8c996fad13039f254d70740ee10ab01d4cd9d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?St=C3=A9phane=20du=20Hamel?= Date: Thu, 10 Jul 2025 19:13:55 +0200 Subject: [PATCH 11/45] Fix VAE preview darkening --- stable-diffusion.cpp | 1 + 1 file changed, 1 insertion(+) diff --git a/stable-diffusion.cpp b/stable-diffusion.cpp index b8c841cd2..77c329293 100644 --- a/stable-diffusion.cpp +++ b/stable-diffusion.cpp @@ -919,6 +919,7 @@ void preview_image(ggml_context* work_ctx, first_stage_model->compute(n_threads, latents, true, &result); } first_stage_model->free_compute_buffer(); + ggml_tensor_scale(latents, scale_factor); ggml_tensor_scale_output(result); } else if (preview_mode == SD_PREVIEW_TAE) { From c3d72c04bc7267195e9727420c0edd4a2f22bea4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?St=C3=A9phane=20du=20Hamel?= Date: Thu, 10 Jul 2025 19:13:56 +0200 Subject: [PATCH 12/45] Increase context memory when loading multiple auto encoders --- stable-diffusion.cpp | 3 +++ 1 file changed, 3 insertions(+) diff --git a/stable-diffusion.cpp b/stable-diffusion.cpp index 77c329293..4e777a2d7 100644 --- a/stable-diffusion.cpp +++ b/stable-diffusion.cpp @@ -2042,6 +2042,9 @@ sd_image_t* generate_image(sd_ctx_t* sd_ctx, const sd_img_gen_params_t* sd_img_g if (sd_ctx->sd->stacked_id) { params.mem_size += static_cast(10 * 1024 * 1024); // 10 MB } + if (sd_ctx->sd->first_stage_model != 
nullptr && sd_ctx->sd->tae_first_stage != nullptr) { + params.mem_size *= 2; + } params.mem_size += width * height * 3 * sizeof(float) * 3; params.mem_size += width * height * 3 * sizeof(float) * 3 * sd_img_gen_params->ref_images_count; params.mem_size *= sd_img_gen_params->batch_count; From 8059ac343d7cda24cf3c9666c00606dcd2e81b44 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?St=C3=A9phane=20du=20Hamel?= Date: Thu, 10 Jul 2025 19:13:58 +0200 Subject: [PATCH 13/45] Increase context memory when previewing with auto encoder instead --- stable-diffusion.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/stable-diffusion.cpp b/stable-diffusion.cpp index 4e777a2d7..4be0c9b76 100644 --- a/stable-diffusion.cpp +++ b/stable-diffusion.cpp @@ -2042,7 +2042,7 @@ sd_image_t* generate_image(sd_ctx_t* sd_ctx, const sd_img_gen_params_t* sd_img_g if (sd_ctx->sd->stacked_id) { params.mem_size += static_cast(10 * 1024 * 1024); // 10 MB } - if (sd_ctx->sd->first_stage_model != nullptr && sd_ctx->sd->tae_first_stage != nullptr) { + if (preview_mode!=SD_PREVIEW_NONE && preview_mode!=SD_PREVIEW_PROJ) { params.mem_size *= 2; } params.mem_size += width * height * 3 * sizeof(float) * 3; From 8e6024f81a89b2c36f325bf5cec8af880d25c067 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?St=C3=A9phane=20du=20Hamel?= Date: Thu, 10 Jul 2025 19:13:59 +0200 Subject: [PATCH 14/45] fix compile warnings --- latent-preview.h | 80 ++++++++++++++++++++++++------------------------ 1 file changed, 40 insertions(+), 40 deletions(-) diff --git a/latent-preview.h b/latent-preview.h index 5457c47ed..ca4d132f3 100644 --- a/latent-preview.h +++ b/latent-preview.h @@ -1,56 +1,56 @@ // https://github.com/comfyanonymous/ComfyUI/blob/master/comfy/latent_formats.py#L152-L169 const float flux_latent_rgb_proj[16][3] = { - {-0.0346, 0.0244, 0.0681}, - {0.0034, 0.0210, 0.0687}, - {0.0275, -0.0668, -0.0433}, - {-0.0174, 0.0160, 0.0617}, - {0.0859, 0.0721, 0.0329}, - {0.0004, 0.0383, 0.0115}, - {0.0405, 0.0861, 0.0915}, 
- {-0.0236, -0.0185, -0.0259}, - {-0.0245, 0.0250, 0.1180}, - {0.1008, 0.0755, -0.0421}, - {-0.0515, 0.0201, 0.0011}, - {0.0428, -0.0012, -0.0036}, - {0.0817, 0.0765, 0.0749}, - {-0.1264, -0.0522, -0.1103}, - {-0.0280, -0.0881, -0.0499}, - {-0.1262, -0.0982, -0.0778}}; + {-0.0346f, 0.0244f, 0.0681f}, + {0.0034f, 0.0210f, 0.0687f}, + {0.0275f, -0.0668f, -0.0433f}, + {-0.0174f, 0.0160f, 0.0617f}, + {0.0859f, 0.0721f, 0.0329f}, + {0.0004f, 0.0383f, 0.0115f}, + {0.0405f, 0.0861f, 0.0915f}, + {-0.0236f, -0.0185f, -0.0259f}, + {-0.0245f, 0.0250f, 0.1180f}, + {0.1008f, 0.0755f, -0.0421f}, + {-0.0515f, 0.0201f, 0.0011f}, + {0.0428f, -0.0012f, -0.0036f}, + {0.0817f, 0.0765f, 0.0749f}, + {-0.1264f, -0.0522f, -0.1103f}, + {-0.0280f, -0.0881f, -0.0499f}, + {-0.1262f, -0.0982f, -0.0778f}}; // https://github.com/Stability-AI/sd3.5/blob/main/sd3_impls.py#L228-L246 const float sd3_latent_rgb_proj[16][3] = { - {-0.0645, 0.0177, 0.1052}, - {0.0028, 0.0312, 0.0650}, - {0.1848, 0.0762, 0.0360}, - {0.0944, 0.0360, 0.0889}, - {0.0897, 0.0506, -0.0364}, - {-0.0020, 0.1203, 0.0284}, - {0.0855, 0.0118, 0.0283}, - {-0.0539, 0.0658, 0.1047}, - {-0.0057, 0.0116, 0.0700}, - {-0.0412, 0.0281, -0.0039}, - {0.1106, 0.1171, 0.1220}, - {-0.0248, 0.0682, -0.0481}, - {0.0815, 0.0846, 0.1207}, - {-0.0120, -0.0055, -0.0867}, - {-0.0749, -0.0634, -0.0456}, - {-0.1418, -0.1457, -0.1259}, + {-0.0645f, 0.0177f, 0.1052f}, + {0.0028f, 0.0312f, 0.0650f}, + {0.1848f, 0.0762f, 0.0360f}, + {0.0944f, 0.0360f, 0.0889f}, + {0.0897f, 0.0506f, -0.0364f}, + {-0.0020f, 0.1203f, 0.0284f}, + {0.0855f, 0.0118f, 0.0283f}, + {-0.0539f, 0.0658f, 0.1047f}, + {-0.0057f, 0.0116f, 0.0700f}, + {-0.0412f, 0.0281f, -0.0039f}, + {0.1106f, 0.1171f, 0.1220f}, + {-0.0248f, 0.0682f, -0.0481f}, + {0.0815f, 0.0846f, 0.1207f}, + {-0.0120f, -0.0055f, -0.0867f}, + {-0.0749f, -0.0634f, -0.0456f}, + {-0.1418f, -0.1457f, -0.1259f}, }; // https://github.com/comfyanonymous/ComfyUI/blob/master/comfy/latent_formats.py#L32-L38 const float 
sdxl_latent_rgb_proj[4][3] = { - {0.3651, 0.4232, 0.4341}, - {-0.2533, -0.0042, 0.1068}, - {0.1076, 0.1111, -0.0362}, - {-0.3165, -0.2492, -0.2188}}; + {0.3651f, 0.4232f, 0.4341f}, + {-0.2533f, -0.0042f, 0.1068f}, + {0.1076f, 0.1111f, -0.0362f}, + {-0.3165f, -0.2492f, -0.2188f}}; // https://github.com/comfyanonymous/ComfyUI/blob/master/comfy/latent_formats.py#L32-L38 const float sd_latent_rgb_proj[4][3]{ - {0.3512, 0.2297, 0.3227}, - {0.3250, 0.4974, 0.2350}, - {-0.2829, 0.1762, 0.2721}, - {-0.2120, -0.2616, -0.7177}}; + {0.3512f, 0.2297f, 0.3227f}, + {0.3250f, 0.4974f, 0.2350f}, + {-0.2829f, 0.1762f, 0.2721f}, + {-0.2120f, -0.2616f, -0.7177f}}; void preview_latent_image(uint8_t* buffer, struct ggml_tensor* latents, const float (*latent_rgb_proj)[3], int width, int height, int dim) { size_t buffer_head = 0; From 19ac567924ccbdfe0a238a1ce7fa9f64361d91f0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?St=C3=A9phane=20du=20Hamel?= Date: Thu, 10 Jul 2025 19:14:00 +0200 Subject: [PATCH 15/45] fix print-params --- examples/cli/main.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/cli/main.cpp b/examples/cli/main.cpp index 3e7242926..a74fb6b55 100644 --- a/examples/cli/main.cpp +++ b/examples/cli/main.cpp @@ -182,7 +182,7 @@ void print_params(SDParams params) { printf(" chroma_use_dit_mask: %s\n", params.chroma_use_dit_mask ? "true" : "false"); printf(" chroma_use_t5_mask: %s\n", params.chroma_use_t5_mask ? 
"true" : "false"); printf(" chroma_t5_mask_pad: %d\n", params.chroma_t5_mask_pad); - printf(" preview_mode: %d\n", previews_str[params.preview_method]); + printf(" preview_mode: %s\n", previews_str[params.preview_method]); printf(" preview_interval: %d\n", params.preview_interval); } From 430f7d8b6b0d9db88b6903e1cf42fbfad9d7f2d3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?St=C3=A9phane=20du=20Hamel?= Date: Thu, 10 Jul 2025 19:14:01 +0200 Subject: [PATCH 16/45] fix preview with unet inpaint models --- stable-diffusion.cpp | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/stable-diffusion.cpp b/stable-diffusion.cpp index 4be0c9b76..da679529f 100644 --- a/stable-diffusion.cpp +++ b/stable-diffusion.cpp @@ -882,9 +882,9 @@ void preview_image(ggml_context* work_ctx, } else if (dim == 4) { // 4 channels VAE - if (version == VERSION_SDXL) { + if (sd_version_is_sdxl(version)) { latent_rgb_proj = sdxl_latent_rgb_proj; - } else if (version == VERSION_SD1 || version == VERSION_SD2) { + } else if (sd_version_is_sd1(version) || sd_version_is_sd2(version)) { latent_rgb_proj = sd_latent_rgb_proj; } else { // unknown model @@ -2042,7 +2042,7 @@ sd_image_t* generate_image(sd_ctx_t* sd_ctx, const sd_img_gen_params_t* sd_img_g if (sd_ctx->sd->stacked_id) { params.mem_size += static_cast(10 * 1024 * 1024); // 10 MB } - if (preview_mode!=SD_PREVIEW_NONE && preview_mode!=SD_PREVIEW_PROJ) { + if (preview_mode != SD_PREVIEW_NONE && preview_mode != SD_PREVIEW_PROJ) { params.mem_size *= 2; } params.mem_size += width * height * 3 * sizeof(float) * 3; From 2272068b96133902bf96abd7b5b18b63dd83b4e4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?St=C3=A9phane=20du=20Hamel?= Date: Thu, 10 Jul 2025 19:14:02 +0200 Subject: [PATCH 17/45] do not spam pretty progress when using tiled vae/tae as preview --- stable-diffusion.cpp | 28 +++++++++++++++++++++++++--- stable-diffusion.h | 2 ++ util.cpp | 6 ++++++ 3 files changed, 33 insertions(+), 3 deletions(-) diff --git a/stable-diffusion.cpp 
b/stable-diffusion.cpp index da679529f..6c19e6ce2 100644 --- a/stable-diffusion.cpp +++ b/stable-diffusion.cpp @@ -71,6 +71,14 @@ void calculate_alphas_cumprod(float* alphas_cumprod, } } +void suppress_pp(int step, int steps, float time, void* data) { + (void)step; + (void)steps; + (void)time; + (void)data; + return; +} + /*=============================================== StableDiffusionGGML ================================================*/ class StableDiffusionGGML { @@ -853,7 +861,16 @@ class StableDiffusionGGML { LOG_DEBUG("computing svd condition graph completed, taking %" PRId64 " ms", t1 - t0); return {c_crossattn, y, c_concat}; } -void preview_image(ggml_context* work_ctx, + + void silent_tiling(ggml_tensor* input, ggml_tensor* output, const int scale, const int tile_size, const float tile_overlap_factor, on_tile_process on_processing) { + sd_progress_cb_t cb = sd_get_progress_callback(); + void* cbd = sd_get_progress_callback_data(); + sd_set_progress_callback((sd_progress_cb_t)suppress_pp, NULL); + sd_tiling(input, output, scale, tile_size, tile_overlap_factor, on_processing); + sd_set_progress_callback(cb, cbd); + } + + void preview_image(ggml_context* work_ctx, int step, struct ggml_tensor* latents, enum SDVersion version, @@ -914,7 +931,8 @@ void preview_image(ggml_context* work_ctx, auto on_tiling = [&](ggml_tensor* in, ggml_tensor* out, bool init) { first_stage_model->compute(n_threads, in, true, &out); }; - sd_tiling(latents, result, 8, 32, 0.5f, on_tiling); + silent_tiling(latents, result, 8, 32, 0.5f, on_tiling); + } else { first_stage_model->compute(n_threads, latents, true, &result); } @@ -932,7 +950,7 @@ void preview_image(ggml_context* work_ctx, auto on_tiling = [&](ggml_tensor* in, ggml_tensor* out, bool init) { tae_first_stage->compute(n_threads, in, true, &out); }; - sd_tiling(latents, result, 8, 64, 0.5f, on_tiling); + silent_tiling(latents, result, 8, 64, 0.5f, on_tiling); } else { tae_first_stage->compute(n_threads, latents, true, 
&result); } @@ -1210,6 +1228,10 @@ void preview_image(ggml_context* work_ctx, } } } + if (step > 0) { + pretty_progress(step, (int)steps, (t1 - t0) / 1000000.f); + // LOG_INFO("step %d sampling completed taking %.2fs", step, (t1 - t0) * 1.0f / 1000000); + } if (step_callback != nullptr) { if (step % preview_interval == 0) { diff --git a/stable-diffusion.h b/stable-diffusion.h index d7522a46e..a043e1904 100644 --- a/stable-diffusion.h +++ b/stable-diffusion.h @@ -219,6 +219,8 @@ typedef void (*sd_progress_cb_t)(int step, int steps, float time, void* data); SD_API void sd_set_log_callback(sd_log_cb_t sd_log_cb, void* data); SD_API void sd_set_progress_callback(sd_progress_cb_t cb, void* data); +SD_API sd_progress_cb_t sd_get_progress_callback(); +SD_API void* sd_get_progress_callback_data(); SD_API int32_t get_num_physical_cores(); SD_API const char* sd_get_system_info(); diff --git a/util.cpp b/util.cpp index 92bc9ef50..18caf8567 100644 --- a/util.cpp +++ b/util.cpp @@ -420,6 +420,12 @@ void sd_set_progress_callback(sd_progress_cb_t cb, void* data) { sd_progress_cb = cb; sd_progress_cb_data = data; } +sd_progress_cb_t sd_get_progress_callback(){ + return sd_progress_cb; +} +void* sd_get_progress_callback_data(){ + return sd_progress_cb_data; +} const char* sd_get_system_info() { static char buffer[1024]; std::stringstream ss; From eeca6979b104ef1ee287551a54f8c6108e6ad472 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?St=C3=A9phane=20du=20Hamel?= Date: Thu, 10 Jul 2025 19:14:04 +0200 Subject: [PATCH 18/45] change log level of "processing %i tiles" --- ggml_extend.hpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ggml_extend.hpp b/ggml_extend.hpp index 22dd88c94..a5c15de16 100644 --- a/ggml_extend.hpp +++ b/ggml_extend.hpp @@ -636,7 +636,7 @@ __STATIC_INLINE__ void sd_tiling(ggml_tensor* input, ggml_tensor* output, const ggml_tensor* output_tile = ggml_new_tensor_4d(tiles_ctx, GGML_TYPE_F32, tile_size * scale, tile_size * scale, output->ne[2], 1); 
on_processing(input_tile, NULL, true); int num_tiles = ceil((float)input_width / non_tile_overlap) * ceil((float)input_height / non_tile_overlap); - LOG_INFO("processing %i tiles", num_tiles); + LOG_DEBUG("processing %i tiles", num_tiles); pretty_progress(1, num_tiles, 0.0f); int tile_count = 1; bool last_y = false, last_x = false; From beb0e91c846aa2325966c886f29322a3e30ef023 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?St=C3=A9phane=20du=20Hamel?= Date: Thu, 10 Jul 2025 19:14:05 +0200 Subject: [PATCH 19/45] Refactor preview to match the other callbacks --- examples/cli/main.cpp | 15 ++++++++------- stable-diffusion.cpp | 43 ++++++++++++++++--------------------------- stable-diffusion.h | 12 +++++------- util.cpp | 25 +++++++++++++++++++++++-- util.h | 7 +++++++ 5 files changed, 59 insertions(+), 43 deletions(-) diff --git a/examples/cli/main.cpp b/examples/cli/main.cpp index a74fb6b55..560565434 100644 --- a/examples/cli/main.cpp +++ b/examples/cli/main.cpp @@ -119,10 +119,10 @@ struct SDParams { bool chroma_use_t5_mask = false; int chroma_t5_mask_pad = 1; - sd_preview_policy_t preview_method = SD_PREVIEW_NONE; - int preview_interval = 1; - std::string preview_path = "preview.png"; - bool taesd_preview = false; + sd_preview_t preview_method = SD_PREVIEW_NONE; + int preview_interval = 1; + std::string preview_path = "preview.png"; + bool taesd_preview = false; }; void print_params(SDParams params) { @@ -605,7 +605,7 @@ void parse_args(int argc, const char** argv, SDParams& params) { preview); return -1; } - params.preview_method = (sd_preview_policy_t)preview_method; + params.preview_method = (sd_preview_t)preview_method; return 1; }; @@ -820,6 +820,7 @@ int main(int argc, const char* argv[]) { }}; sd_set_log_callback(sd_log_cb, (void*)¶ms); + sd_set_preview_callback((sd_preview_cb_t)step_callback, params.preview_method, params.preview_interval); if (params.verbose) { print_params(params); @@ -1044,7 +1045,7 @@ int main(int argc, const char* argv[]) { 
params.input_id_images_path.c_str(), }; - results = generate_image(sd_ctx, &img_gen_params, params.preview_method, params.preview_interval,(step_callback_t)step_callback); + results = generate_image(sd_ctx, &img_gen_params); expected_num_results = params.batch_count; } else if (params.mode == VID_GEN) { sd_vid_gen_params_t vid_gen_params = { @@ -1062,7 +1063,7 @@ int main(int argc, const char* argv[]) { params.augmentation_level, }; - results = generate_video(sd_ctx, &vid_gen_params, (step_callback_t)step_callback); + results = generate_video(sd_ctx, &vid_gen_params); expected_num_results = params.video_frames; } diff --git a/stable-diffusion.cpp b/stable-diffusion.cpp index 6c19e6ce2..d40c0ee82 100644 --- a/stable-diffusion.cpp +++ b/stable-diffusion.cpp @@ -874,7 +874,7 @@ class StableDiffusionGGML { int step, struct ggml_tensor* latents, enum SDVersion version, - sd_preview_policy_t preview_mode, + sd_preview_t preview_mode, ggml_tensor* result, std::function step_callback) { const uint32_t channel = 3; @@ -985,10 +985,7 @@ class StableDiffusionGGML { int start_merge_step, SDCondition id_cond, std::vector ref_latents = {}, - ggml_tensor* denoise_mask = nullptr, - sd_preview_policy_t preview_mode = SD_PREVIEW_NONE, - int preview_interval = 1, - std::function step_callback = nullptr) { + ggml_tensor* denoise_mask = nullptr) { std::vector skip_layers(guidance.slg.layers, guidance.slg.layers + guidance.slg.layer_count); float cfg_scale = guidance.txt_cfg; @@ -1050,7 +1047,8 @@ class StableDiffusionGGML { struct ggml_tensor* denoised = ggml_dup_tensor(work_ctx, x); struct ggml_tensor* preview_tensor = NULL; - if (preview_mode != SD_PREVIEW_NONE && preview_mode != SD_PREVIEW_PROJ) { + auto sd_preview_mode = sd_get_preview_mode(); + if (sd_preview_mode != SD_PREVIEW_NONE && sd_preview_mode != SD_PREVIEW_PROJ) { preview_tensor = ggml_new_tensor_4d(work_ctx, GGML_TYPE_F32, (denoised->ne[0] * 8), (denoised->ne[1] * 8), @@ -1232,10 +1230,11 @@ class StableDiffusionGGML { 
pretty_progress(step, (int)steps, (t1 - t0) / 1000000.f); // LOG_INFO("step %d sampling completed taking %.2fs", step, (t1 - t0) * 1.0f / 1000000); } - - if (step_callback != nullptr) { - if (step % preview_interval == 0) { - preview_image(work_ctx, step, denoised, version, preview_mode, preview_tensor, step_callback); + auto sd_preview_cb = sd_get_preview_callback(); + auto sd_preview_mode = sd_get_preview_mode(); + if (sd_preview_cb != NULL) { + if (step % sd_get_preview_interval() == 0) { + preview_image(work_ctx, step, denoised, version, sd_preview_mode, preview_tensor, sd_preview_cb); } } return denoised; @@ -1687,10 +1686,7 @@ sd_image_t* generate_image_internal(sd_ctx_t* sd_ctx, std::string input_id_images_path, std::vector ref_latents, ggml_tensor* concat_latent = NULL, - ggml_tensor* denoise_mask = NULL, - sd_preview_policy_t preview_mode = SD_PREVIEW_NONE, - int preview_interval = 1, - std::function step_callback = nullptr) { + ggml_tensor* denoise_mask = NULL) { if (seed < 0) { // Generally, when using the provided command line, the seed is always >0. 
// However, to prevent potential issues if 'stable-diffusion.cpp' is invoked as a library @@ -1959,10 +1955,7 @@ sd_image_t* generate_image_internal(sd_ctx_t* sd_ctx, start_merge_step, id_cond, ref_latents, - denoise_mask, - preview_mode, - preview_interval, - step_callback); + denoise_mask); // struct ggml_tensor* x_0 = load_tensor_from_file(ctx, "samples_ddim.bin"); // print_ggml_tensor(x_0); @@ -2036,7 +2029,7 @@ ggml_tensor* generate_init_latent(sd_ctx_t* sd_ctx, return init_latent; } -sd_image_t* generate_image(sd_ctx_t* sd_ctx, const sd_img_gen_params_t* sd_img_gen_params, sd_preview_policy_t preview_mode, int preview_interval, step_callback_t step_callback) { +sd_image_t* generate_image(sd_ctx_t* sd_ctx, const sd_img_gen_params_t* sd_img_gen_params) { int width = sd_img_gen_params->width; int height = sd_img_gen_params->height; if (sd_version_is_dit(sd_ctx->sd->version)) { @@ -2064,7 +2057,8 @@ sd_image_t* generate_image(sd_ctx_t* sd_ctx, const sd_img_gen_params_t* sd_img_g if (sd_ctx->sd->stacked_id) { params.mem_size += static_cast(10 * 1024 * 1024); // 10 MB } - if (preview_mode != SD_PREVIEW_NONE && preview_mode != SD_PREVIEW_PROJ) { + auto sd_preview_mode = sd_get_preview_mode(); + if (sd_preview_mode != SD_PREVIEW_NONE && sd_preview_mode != SD_PREVIEW_PROJ) { params.mem_size *= 2; } params.mem_size += width * height * 3 * sizeof(float) * 3; @@ -2248,10 +2242,7 @@ sd_image_t* generate_image(sd_ctx_t* sd_ctx, const sd_img_gen_params_t* sd_img_g sd_img_gen_params->input_id_images_path, ref_latents, concat_latent, - denoise_mask, - preview_mode, - preview_interval, - step_callback); + denoise_mask); size_t t2 = ggml_time_ms(); @@ -2260,7 +2251,7 @@ sd_image_t* generate_image(sd_ctx_t* sd_ctx, const sd_img_gen_params_t* sd_img_g return result_images; } -SD_API sd_image_t* generate_video(sd_ctx_t* sd_ctx, const sd_vid_gen_params_t* sd_vid_gen_params, step_callback_t step_callback) { +SD_API sd_image_t* generate_video(sd_ctx_t* sd_ctx, const 
sd_vid_gen_params_t* sd_vid_gen_params) { if (sd_ctx == NULL || sd_vid_gen_params == NULL) { return NULL; } @@ -2344,8 +2335,6 @@ SD_API sd_image_t* generate_video(sd_ctx_t* sd_ctx, const sd_vid_gen_params_t* s -1, SDCondition(NULL, NULL, NULL), {}, - NULL, - (sd_preview_policy_t)0, 1, NULL); int64_t t2 = ggml_time_ms(); diff --git a/stable-diffusion.h b/stable-diffusion.h index a043e1904..8e80d9314 100644 --- a/stable-diffusion.h +++ b/stable-diffusion.h @@ -111,7 +111,7 @@ enum sd_log_level_t { SD_LOG_ERROR }; -enum sd_preview_policy_t { +enum sd_preview_t { SD_PREVIEW_NONE, SD_PREVIEW_PROJ, SD_PREVIEW_TAE, @@ -216,11 +216,11 @@ typedef struct sd_ctx_t sd_ctx_t; typedef void (*sd_log_cb_t)(enum sd_log_level_t level, const char* text, void* data); typedef void (*sd_progress_cb_t)(int step, int steps, float time, void* data); +typedef void (*sd_preview_cb_t)(int, sd_image_t); SD_API void sd_set_log_callback(sd_log_cb_t sd_log_cb, void* data); SD_API void sd_set_progress_callback(sd_progress_cb_t cb, void* data); -SD_API sd_progress_cb_t sd_get_progress_callback(); -SD_API void* sd_get_progress_callback_data(); +SD_API void sd_set_preview_callback(sd_preview_cb_t cb, sd_preview_t mode, int interval); SD_API int32_t get_num_physical_cores(); SD_API const char* sd_get_system_info(); @@ -239,14 +239,12 @@ SD_API char* sd_ctx_params_to_str(const sd_ctx_params_t* sd_ctx_params); SD_API sd_ctx_t* new_sd_ctx(const sd_ctx_params_t* sd_ctx_params); SD_API void free_sd_ctx(sd_ctx_t* sd_ctx); -typedef void (*step_callback_t)(int, sd_image_t); - SD_API void sd_img_gen_params_init(sd_img_gen_params_t* sd_img_gen_params); SD_API char* sd_img_gen_params_to_str(const sd_img_gen_params_t* sd_img_gen_params); -SD_API sd_image_t* generate_image(sd_ctx_t* sd_ctx, const sd_img_gen_params_t* sd_img_gen_params, sd_preview_policy_t preview_mode, int preview_interval, step_callback_t step_callback); +SD_API sd_image_t* generate_image(sd_ctx_t* sd_ctx, const sd_img_gen_params_t* 
sd_img_gen_params); SD_API void sd_vid_gen_params_init(sd_vid_gen_params_t* sd_vid_gen_params); -SD_API sd_image_t* generate_video(sd_ctx_t* sd_ctx, const sd_vid_gen_params_t* sd_vid_gen_params, step_callback_t step_callback); // broken +SD_API sd_image_t* generate_video(sd_ctx_t* sd_ctx, const sd_vid_gen_params_t* sd_vid_gen_params); // broken typedef struct upscaler_ctx_t upscaler_ctx_t; diff --git a/util.cpp b/util.cpp index 18caf8567..cac5b1408 100644 --- a/util.cpp +++ b/util.cpp @@ -247,6 +247,10 @@ int32_t get_num_physical_cores() { static sd_progress_cb_t sd_progress_cb = NULL; void* sd_progress_cb_data = NULL; +static sd_preview_cb_t sd_preview_cb = NULL; +sd_preview_t sd_preview_mode = SD_PREVIEW_NONE; +int sd_preview_interval = 1; + std::u32string utf8_to_utf32(const std::string& utf8_str) { std::wstring_convert, char32_t> converter; return converter.from_bytes(utf8_str); @@ -420,10 +424,27 @@ void sd_set_progress_callback(sd_progress_cb_t cb, void* data) { sd_progress_cb = cb; sd_progress_cb_data = data; } -sd_progress_cb_t sd_get_progress_callback(){ +void sd_set_preview_callback(sd_preview_cb_t cb, sd_preview_t mode = SD_PREVIEW_PROJ, int interval = 1) { + sd_preview_cb = cb; + sd_preview_mode = mode; + sd_preview_interval = interval; +} + +sd_preview_cb_t sd_get_preview_callback() { + return sd_preview_cb; +} + +sd_preview_t sd_get_preview_mode() { + return sd_preview_mode; +} +int sd_get_preview_interval() { + return sd_preview_interval; +} + +sd_progress_cb_t sd_get_progress_callback() { return sd_progress_cb; } -void* sd_get_progress_callback_data(){ +void* sd_get_progress_callback_data() { return sd_progress_cb_data; } const char* sd_get_system_info() { diff --git a/util.h b/util.h index d98c9a280..bbcee8905 100644 --- a/util.h +++ b/util.h @@ -57,6 +57,13 @@ std::string trim(const std::string& s); std::vector> parse_prompt_attention(const std::string& text); +sd_progress_cb_t sd_get_progress_callback(); +void* sd_get_progress_callback_data(); + 
+sd_preview_cb_t sd_get_preview_callback(); +sd_preview_t sd_get_preview_mode(); +int sd_get_preview_interval(); + #define LOG_DEBUG(format, ...) log_printf(SD_LOG_DEBUG, __FILE__, __LINE__, format, ##__VA_ARGS__) #define LOG_INFO(format, ...) log_printf(SD_LOG_INFO, __FILE__, __LINE__, format, ##__VA_ARGS__) #define LOG_WARN(format, ...) log_printf(SD_LOG_WARN, __FILE__, __LINE__, format, ##__VA_ARGS__) From d465a70efd10e72cd16f66c0d1f9d17b16db180d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?St=C3=A9phane=20du=20Hamel?= Date: Wed, 23 Jul 2025 17:08:12 +0200 Subject: [PATCH 20/45] preview: new API --- examples/cli/main.cpp | 12 ++++++------ stable-diffusion.cpp | 35 +++++++++++++++++++++++++++++------ stable-diffusion.h | 16 +++++++++------- util.cpp | 6 +++--- util.h | 2 +- 5 files changed, 48 insertions(+), 23 deletions(-) diff --git a/examples/cli/main.cpp b/examples/cli/main.cpp index 560565434..f63435b78 100644 --- a/examples/cli/main.cpp +++ b/examples/cli/main.cpp @@ -119,7 +119,7 @@ struct SDParams { bool chroma_use_t5_mask = false; int chroma_t5_mask_pad = 1; - sd_preview_t preview_method = SD_PREVIEW_NONE; + preview_t preview_method = PREVIEW_NONE; int preview_interval = 1; std::string preview_path = "preview.png"; bool taesd_preview = false; @@ -201,7 +201,7 @@ void print_usage(int argc, const char* argv[]) { printf(" --t5xxl path to the t5xxl text encoder\n"); printf(" --vae [VAE] path to vae\n"); printf(" --taesd [TAESD] path to taesd. Using Tiny AutoEncoder for fast decoding (low quality)\n"); - printf(" --taesd-preview-only prevents usage of taesd for decoding the final image. (for use with --preview %s)\n", previews_str[SD_PREVIEW_TAE]); + printf(" --taesd-preview-only prevents usage of taesd for decoding the final image. 
(for use with --preview %s)\n", previews_str[PREVIEW_TAE]); printf(" --control-net [CONTROL_PATH] path to control net model\n"); printf(" --embd-dir [EMBEDDING_PATH] path to embeddings\n"); printf(" --stacked-id-embd-dir [DIR] path to PHOTOMAKER stacked id embeddings\n"); @@ -257,8 +257,8 @@ void print_usage(int argc, const char* argv[]) { printf(" This might crash if it is not supported by the backend.\n"); printf(" --control-net-cpu keep controlnet in cpu (for low vram)\n"); printf(" --canny apply canny preprocessor (edge detection)\n"); - printf(" --preview {%s,%s,%s,%s} preview method. (default is %s(disabled))\n", previews_str[0], previews_str[1], previews_str[2], previews_str[3], previews_str[SD_PREVIEW_NONE]); - printf(" %s is the fastest\n", previews_str[SD_PREVIEW_PROJ]); + printf(" --preview {%s,%s,%s,%s} preview method. (default is %s(disabled))\n", previews_str[0], previews_str[1], previews_str[2], previews_str[3], previews_str[PREVIEW_NONE]); + printf(" %s is the fastest\n", previews_str[PREVIEW_PROJ]); printf(" --preview-interval [N] How often to save the image preview"); printf(" --preview-path [PATH} path to write preview image to (default: ./preview.png)\n"); printf(" --color colors the logging tags according to level\n"); @@ -595,7 +595,7 @@ void parse_args(int argc, const char** argv, SDParams& params) { } const char* preview = argv[index]; int preview_method = -1; - for (int m = 0; m < N_PREVIEWS; m++) { + for (int m = 0; m < PREVIEW_COUNT; m++) { if (!strcmp(preview, previews_str[m])) { preview_method = m; } @@ -605,7 +605,7 @@ void parse_args(int argc, const char** argv, SDParams& params) { preview); return -1; } - params.preview_method = (sd_preview_t)preview_method; + params.preview_method = (preview_t)preview_method; return 1; }; diff --git a/stable-diffusion.cpp b/stable-diffusion.cpp index d40c0ee82..6a1567735 100644 --- a/stable-diffusion.cpp +++ b/stable-diffusion.cpp @@ -874,14 +874,14 @@ class StableDiffusionGGML { int step, struct 
ggml_tensor* latents, enum SDVersion version, - sd_preview_t preview_mode, + preview_t preview_mode, ggml_tensor* result, std::function step_callback) { const uint32_t channel = 3; uint32_t width = latents->ne[0]; uint32_t height = latents->ne[1]; uint32_t dim = latents->ne[2]; - if (preview_mode == SD_PREVIEW_PROJ) { + if (preview_mode == PREVIEW_PROJ) { const float (*latent_rgb_proj)[channel]; if (dim == 16) { @@ -924,7 +924,7 @@ class StableDiffusionGGML { step_callback(step, image); free(image.data); } else { - if (preview_mode == SD_PREVIEW_VAE) { + if (preview_mode == PREVIEW_VAE) { ggml_tensor_scale(latents, 1.0f / scale_factor); if (vae_tiling) { // split latent in 32x32 tiles and compute in several steps @@ -940,7 +940,7 @@ class StableDiffusionGGML { ggml_tensor_scale(latents, scale_factor); ggml_tensor_scale_output(result); - } else if (preview_mode == SD_PREVIEW_TAE) { + } else if (preview_mode == PREVIEW_TAE) { if (tae_first_stage == nullptr) { LOG_WARN("TAE not found for preview"); return; @@ -1048,7 +1048,7 @@ class StableDiffusionGGML { struct ggml_tensor* preview_tensor = NULL; auto sd_preview_mode = sd_get_preview_mode(); - if (sd_preview_mode != SD_PREVIEW_NONE && sd_preview_mode != SD_PREVIEW_PROJ) { + if (sd_preview_mode != PREVIEW_NONE && sd_preview_mode != PREVIEW_PROJ) { preview_tensor = ggml_new_tensor_4d(work_ctx, GGML_TYPE_F32, (denoised->ne[0] * 8), (denoised->ne[1] * 8), @@ -1447,6 +1447,29 @@ enum schedule_t str_to_schedule(const char* str) { return SCHEDULE_COUNT; } +const char* preview_to_str[] = { + "none", + "proj", + "tae", + "vae", +}; + +const char* sd_preview_name(enum preview_t preview) { + if (preview < PREVIEW_COUNT) { + return preview_to_str[preview]; + } + return NONE_STR; +} + +enum preview_t str_to_preview(const char* str) { + for (int i = 0; i < PREVIEW_COUNT; i++) { + if (!strcmp(str, preview_to_str[i])) { + return (enum preview_t)i; + } + } + return PREVIEW_COUNT; +} + void sd_ctx_params_init(sd_ctx_params_t* 
sd_ctx_params) { memset((void*)sd_ctx_params, 0, sizeof(sd_ctx_params_t)); sd_ctx_params->vae_decode_only = true; @@ -2058,7 +2081,7 @@ sd_image_t* generate_image(sd_ctx_t* sd_ctx, const sd_img_gen_params_t* sd_img_g params.mem_size += static_cast(10 * 1024 * 1024); // 10 MB } auto sd_preview_mode = sd_get_preview_mode(); - if (sd_preview_mode != SD_PREVIEW_NONE && sd_preview_mode != SD_PREVIEW_PROJ) { + if (sd_preview_mode != PREVIEW_NONE && sd_preview_mode != PREVIEW_PROJ) { params.mem_size *= 2; } params.mem_size += width * height * 3 * sizeof(float) * 3; diff --git a/stable-diffusion.h b/stable-diffusion.h index 8e80d9314..f2be3dfef 100644 --- a/stable-diffusion.h +++ b/stable-diffusion.h @@ -111,12 +111,12 @@ enum sd_log_level_t { SD_LOG_ERROR }; -enum sd_preview_t { - SD_PREVIEW_NONE, - SD_PREVIEW_PROJ, - SD_PREVIEW_TAE, - SD_PREVIEW_VAE, - N_PREVIEWS +enum preview_t { + PREVIEW_NONE, + PREVIEW_PROJ, + PREVIEW_TAE, + PREVIEW_VAE, + PREVIEW_COUNT }; typedef struct { @@ -220,7 +220,7 @@ typedef void (*sd_preview_cb_t)(int, sd_image_t); SD_API void sd_set_log_callback(sd_log_cb_t sd_log_cb, void* data); SD_API void sd_set_progress_callback(sd_progress_cb_t cb, void* data); -SD_API void sd_set_preview_callback(sd_preview_cb_t cb, sd_preview_t mode, int interval); +SD_API void sd_set_preview_callback(sd_preview_cb_t cb, preview_t mode, int interval); SD_API int32_t get_num_physical_cores(); SD_API const char* sd_get_system_info(); @@ -232,6 +232,8 @@ SD_API const char* sd_sample_method_name(enum sample_method_t sample_method); SD_API enum sample_method_t str_to_sample_method(const char* str); SD_API const char* sd_schedule_name(enum schedule_t schedule); SD_API enum schedule_t str_to_schedule(const char* str); +SD_API const char* sd_preview_name(enum preview_t preview); +SD_API enum preview_t str_to_preview(const char* str); SD_API void sd_ctx_params_init(sd_ctx_params_t* sd_ctx_params); SD_API char* sd_ctx_params_to_str(const sd_ctx_params_t* sd_ctx_params); diff 
--git a/util.cpp b/util.cpp index cac5b1408..741b77cc5 100644 --- a/util.cpp +++ b/util.cpp @@ -248,7 +248,7 @@ static sd_progress_cb_t sd_progress_cb = NULL; void* sd_progress_cb_data = NULL; static sd_preview_cb_t sd_preview_cb = NULL; -sd_preview_t sd_preview_mode = SD_PREVIEW_NONE; +preview_t sd_preview_mode = PREVIEW_NONE; int sd_preview_interval = 1; std::u32string utf8_to_utf32(const std::string& utf8_str) { @@ -424,7 +424,7 @@ void sd_set_progress_callback(sd_progress_cb_t cb, void* data) { sd_progress_cb = cb; sd_progress_cb_data = data; } -void sd_set_preview_callback(sd_preview_cb_t cb, sd_preview_t mode = SD_PREVIEW_PROJ, int interval = 1) { +void sd_set_preview_callback(sd_preview_cb_t cb, preview_t mode = PREVIEW_PROJ, int interval = 1) { sd_preview_cb = cb; sd_preview_mode = mode; sd_preview_interval = interval; @@ -434,7 +434,7 @@ sd_preview_cb_t sd_get_preview_callback() { return sd_preview_cb; } -sd_preview_t sd_get_preview_mode() { +preview_t sd_get_preview_mode() { return sd_preview_mode; } int sd_get_preview_interval() { diff --git a/util.h b/util.h index bbcee8905..ac32cd080 100644 --- a/util.h +++ b/util.h @@ -61,7 +61,7 @@ sd_progress_cb_t sd_get_progress_callback(); void* sd_get_progress_callback_data(); sd_preview_cb_t sd_get_preview_callback(); -sd_preview_t sd_get_preview_mode(); +preview_t sd_get_preview_mode(); int sd_get_preview_interval(); #define LOG_DEBUG(format, ...) 
log_printf(SD_LOG_DEBUG, __FILE__, __LINE__, format, ##__VA_ARGS__) From 55ef7beb09b0dd7371a82a2cf0605aba2b77bbc4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?St=C3=A9phane=20du=20Hamel?= Date: Sun, 31 Aug 2025 02:08:30 +0200 Subject: [PATCH 21/45] latent proj bias --- latent-preview.h | 11 +++++++++-- stable-diffusion.cpp | 7 ++++++- 2 files changed, 15 insertions(+), 3 deletions(-) diff --git a/latent-preview.h b/latent-preview.h index ca4d132f3..d21700108 100644 --- a/latent-preview.h +++ b/latent-preview.h @@ -1,4 +1,3 @@ - // https://github.com/comfyanonymous/ComfyUI/blob/master/comfy/latent_formats.py#L152-L169 const float flux_latent_rgb_proj[16][3] = { {-0.0346f, 0.0244f, 0.0681f}, @@ -17,6 +16,7 @@ const float flux_latent_rgb_proj[16][3] = { {-0.1264f, -0.0522f, -0.1103f}, {-0.0280f, -0.0881f, -0.0499f}, {-0.1262f, -0.0982f, -0.0778f}}; +float flux_latent_rgb_bias[3] = {-0.0329, -0.0718, -0.0851}; // https://github.com/Stability-AI/sd3.5/blob/main/sd3_impls.py#L228-L246 const float sd3_latent_rgb_proj[16][3] = { @@ -37,6 +37,7 @@ const float sd3_latent_rgb_proj[16][3] = { {-0.0749f, -0.0634f, -0.0456f}, {-0.1418f, -0.1457f, -0.1259f}, }; +float sd3_latent_rgb_bias[3] = {0, 0, 0}; // https://github.com/comfyanonymous/ComfyUI/blob/master/comfy/latent_formats.py#L32-L38 const float sdxl_latent_rgb_proj[4][3] = { @@ -44,6 +45,7 @@ const float sdxl_latent_rgb_proj[4][3] = { {-0.2533f, -0.0042f, 0.1068f}, {0.1076f, 0.1111f, -0.0362f}, {-0.3165f, -0.2492f, -0.2188f}}; +float sdxl_latent_rgb_bias[3] = {0.1084, -0.0175, -0.0011}; // https://github.com/comfyanonymous/ComfyUI/blob/master/comfy/latent_formats.py#L32-L38 const float sd_latent_rgb_proj[4][3]{ @@ -51,8 +53,9 @@ const float sd_latent_rgb_proj[4][3]{ {0.3250f, 0.4974f, 0.2350f}, {-0.2829f, 0.1762f, 0.2721f}, {-0.2120f, -0.2616f, -0.7177f}}; +float sd_latent_rgb_bias[3] = {0,0,0}; -void preview_latent_image(uint8_t* buffer, struct ggml_tensor* latents, const float (*latent_rgb_proj)[3], int width, int height, 
int dim) { +void preview_latent_image(uint8_t* buffer, struct ggml_tensor* latents, const float (*latent_rgb_proj)[3], const float latent_rgb_bias[3], int width, int height, int dim) { size_t buffer_head = 0; for (int j = 0; j < height; j++) { for (int i = 0; i < width; i++) { @@ -64,6 +67,10 @@ void preview_latent_image(uint8_t* buffer, struct ggml_tensor* latents, const fl g += value * latent_rgb_proj[d][1]; b += value * latent_rgb_proj[d][2]; } + // bias + r += latent_rgb_bias[0]; + g += latent_rgb_bias[1]; + b += latent_rgb_bias[2]; // change range r = r * .5f + .5f; diff --git a/stable-diffusion.cpp b/stable-diffusion.cpp index 6a1567735..886c7428f 100644 --- a/stable-diffusion.cpp +++ b/stable-diffusion.cpp @@ -883,14 +883,17 @@ class StableDiffusionGGML { uint32_t dim = latents->ne[2]; if (preview_mode == PREVIEW_PROJ) { const float (*latent_rgb_proj)[channel]; + float *latent_rgb_bias; if (dim == 16) { // 16 channels VAE -> Flux or SD3 if (sd_version_is_sd3(version)) { latent_rgb_proj = sd3_latent_rgb_proj; + latent_rgb_bias = sd3_latent_rgb_bias; } else if (sd_version_is_flux(version)) { latent_rgb_proj = flux_latent_rgb_proj; + latent_rgb_bias = flux_latent_rgb_bias; } else { LOG_WARN("No latent to RGB projection known for this model"); // unknown model @@ -901,8 +904,10 @@ class StableDiffusionGGML { // 4 channels VAE if (sd_version_is_sdxl(version)) { latent_rgb_proj = sdxl_latent_rgb_proj; + latent_rgb_bias = sdxl_latent_rgb_bias; } else if (sd_version_is_sd1(version) || sd_version_is_sd2(version)) { latent_rgb_proj = sd_latent_rgb_proj; + latent_rgb_bias = sd_latent_rgb_bias; } else { // unknown model LOG_WARN("No latent to RGB projection known for this model"); @@ -915,7 +920,7 @@ class StableDiffusionGGML { } uint8_t* data = (uint8_t*)malloc(width * height * channel * sizeof(uint8_t)); - preview_latent_image(data, latents, latent_rgb_proj, width, height, dim); + preview_latent_image(data, latents, latent_rgb_proj,latent_rgb_bias, width, height, 
dim); sd_image_t image = { width, height, From a5278ceec350216d4ab60ee3582636821d27a4ce Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?St=C3=A9phane=20du=20Hamel?= Date: Tue, 9 Sep 2025 13:15:27 +0200 Subject: [PATCH 22/45] fix merge issues --- stable-diffusion.cpp | 64 ++++++-------------------------------------- 1 file changed, 8 insertions(+), 56 deletions(-) diff --git a/stable-diffusion.cpp b/stable-diffusion.cpp index fb6a1a251..3d9b5a653 100644 --- a/stable-diffusion.cpp +++ b/stable-diffusion.cpp @@ -729,6 +729,7 @@ class StableDiffusionGGML { LOG_DEBUG("finished loaded file"); ggml_free(ctx); + use_tiny_autoencoder = use_tiny_autoencoder && !sd_ctx_params->tae_preview_only; return true; } @@ -763,51 +764,6 @@ class StableDiffusionGGML { LOG_ERROR("Unknown scheduler %i", scheduler); abort(); } - if (sd_ctx_params->schedule != DEFAULT) { - switch (sd_ctx_params->schedule) { - case DISCRETE: - LOG_INFO("running with discrete schedule"); - denoiser->schedule = std::make_shared(); - break; - case KARRAS: - LOG_INFO("running with Karras schedule"); - denoiser->schedule = std::make_shared(); - break; - case EXPONENTIAL: - LOG_INFO("running exponential schedule"); - denoiser->schedule = std::make_shared(); - break; - case AYS: - LOG_INFO("Running with Align-Your-Steps schedule"); - denoiser->schedule = std::make_shared(); - denoiser->schedule->version = version; - break; - case GITS: - LOG_INFO("Running with GITS schedule"); - denoiser->schedule = std::make_shared(); - denoiser->schedule->version = version; - break; - case DEFAULT: - // Don't touch anything. 
- break; - default: - LOG_ERROR("Unknown schedule %i", sd_ctx_params->schedule); - abort(); - } - } - - auto comp_vis_denoiser = std::dynamic_pointer_cast(denoiser); - if (comp_vis_denoiser) { - for (int i = 0; i < TIMESTEPS; i++) { - comp_vis_denoiser->sigmas[i] = std::sqrt((1 - ((float*)alphas_cumprod_tensor->data)[i]) / ((float*)alphas_cumprod_tensor->data)[i]); - comp_vis_denoiser->log_sigmas[i] = std::log(comp_vis_denoiser->sigmas[i]); - } - } - - LOG_DEBUG("finished loaded file"); - ggml_free(ctx); - use_tiny_autoencoder = use_tiny_autoencoder && !sd_ctx_params->tae_preview_only; - return true; } bool is_using_v_parameterization_for_sd2(ggml_context* work_ctx, bool is_inpaint = false) { @@ -1134,21 +1090,19 @@ class StableDiffusionGGML { free(image.data); } else { if (preview_mode == PREVIEW_VAE) { - ggml_tensor_scale(latents, 1.0f / scale_factor); + process_latent_out(latents); if (vae_tiling) { // split latent in 32x32 tiles and compute in several steps auto on_tiling = [&](ggml_tensor* in, ggml_tensor* out, bool init) { - first_stage_model->compute(n_threads, in, true, &out); + first_stage_model->compute(n_threads, in, true, &out, NULL); }; silent_tiling(latents, result, 8, 32, 0.5f, on_tiling); } else { - first_stage_model->compute(n_threads, latents, true, &result); + first_stage_model->compute(n_threads, latents, true, &result, work_ctx); } first_stage_model->free_compute_buffer(); - ggml_tensor_scale(latents, scale_factor); - - ggml_tensor_scale_output(result); + process_vae_output_tensor(result); } else if (preview_mode == PREVIEW_TAE) { if (tae_first_stage == nullptr) { LOG_WARN("TAE not found for preview"); @@ -1157,11 +1111,11 @@ class StableDiffusionGGML { if (vae_tiling) { // split latent in 64x64 tiles and compute in several steps auto on_tiling = [&](ggml_tensor* in, ggml_tensor* out, bool init) { - tae_first_stage->compute(n_threads, in, true, &out); + tae_first_stage->compute(n_threads, in, true, &out, NULL); }; silent_tiling(latents, result, 
8, 64, 0.5f, on_tiling); } else { - tae_first_stage->compute(n_threads, latents, true, &result); + tae_first_stage->compute(n_threads, latents, true, &result, work_ctx); } tae_first_stage->free_compute_buffer(); } else { @@ -2884,9 +2838,7 @@ SD_API sd_image_t* generate_video(sd_ctx_t* sd_ctx, const sd_vid_gen_params_t* s -1, {}, {}, - denoise_mask, - {}, - NULL); + denoise_mask); int64_t sampling_end = ggml_time_ms(); LOG_INFO("sampling completed, taking %.2fs", (sampling_end - sampling_start) * 1.0f / 1000); From 030aa3df2e4eacb86e2ed0f3e2342847f4d3244b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?St=C3=A9phane=20du=20Hamel?= Date: Sat, 30 Aug 2025 18:27:17 +0200 Subject: [PATCH 23/45] add wan latent projs --- latent-preview.h | 70 +++++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 69 insertions(+), 1 deletion(-) diff --git a/latent-preview.h b/latent-preview.h index d21700108..c9994a2bd 100644 --- a/latent-preview.h +++ b/latent-preview.h @@ -1,3 +1,71 @@ +const float wan_21_latent_rgb_proj[16][3] = { + {-0.1299, -0.1692, 0.2932}, + {0.0671, 0.0406, 0.0442}, + {0.3568, 0.2548, 0.1747}, + {0.0372, 0.2344, 0.1420}, + {0.0313, 0.0189, -0.0328}, + {0.0296, -0.0956, -0.0665}, + {-0.3477, -0.4059, -0.2925}, + {0.0166, 0.1902, 0.1975}, + {-0.0412, 0.0267, -0.1364}, + {-0.1293, 0.0740, 0.1636}, + {0.0680, 0.3019, 0.1128}, + {0.0032, 0.0581, 0.0639}, + {-0.1251, 0.0927, 0.1699}, + {0.0060, -0.0633, 0.0005}, + {0.3477, 0.2275, 0.2950}, + {0.1984, 0.0913, 0.1861}}; + +const float wan_22_latent_rgb_proj[48][3] = { + {0.0119, 0.0103, 0.0046}, + {-0.1062, -0.0504, 0.0165}, + {0.0140, 0.0409, 0.0491}, + {-0.0813, -0.0677, 0.0607}, + {0.0656, 0.0851, 0.0808}, + {0.0264, 0.0463, 0.0912}, + {0.0295, 0.0326, 0.0590}, + {-0.0244, -0.0270, 0.0025}, + {0.0443, -0.0102, 0.0288}, + {-0.0465, -0.0090, -0.0205}, + {0.0359, 0.0236, 0.0082}, + {-0.0776, 0.0854, 0.1048}, + {0.0564, 0.0264, 0.0561}, + {0.0006, 0.0594, 0.0418}, + {-0.0319, -0.0542, -0.0637}, + {-0.0268, 0.0024, 
0.0260}, + {0.0539, 0.0265, 0.0358}, + {-0.0359, -0.0312, -0.0287}, + {-0.0285, -0.1032, -0.1237}, + {0.1041, 0.0537, 0.0622}, + {-0.0086, -0.0374, -0.0051}, + {0.0390, 0.0670, 0.2863}, + {0.0069, 0.0144, 0.0082}, + {0.0006, -0.0167, 0.0079}, + {0.0313, -0.0574, -0.0232}, + {-0.1454, -0.0902, -0.0481}, + {0.0714, 0.0827, 0.0447}, + {-0.0304, -0.0574, -0.0196}, + {0.0401, 0.0384, 0.0204}, + {-0.0758, -0.0297, -0.0014}, + {0.0568, 0.1307, 0.1372}, + {-0.0055, -0.0310, -0.0380}, + {0.0239, -0.0305, 0.0325}, + {-0.0663, -0.0673, -0.0140}, + {-0.0416, -0.0047, -0.0023}, + {0.0166, 0.0112, -0.0093}, + {-0.0211, 0.0011, 0.0331}, + {0.1833, 0.1466, 0.2250}, + {-0.0368, 0.0370, 0.0295}, + {-0.3441, -0.3543, -0.2008}, + {-0.0479, -0.0489, -0.0420}, + {-0.0660, -0.0153, 0.0800}, + {-0.0101, 0.0068, 0.0156}, + {-0.0690, -0.0452, -0.0927}, + {-0.0145, 0.0041, 0.0015}, + {0.0421, 0.0451, 0.0373}, + {0.0504, -0.0483, -0.0356}, + {-0.0837, 0.0168, 0.0055}}; + // https://github.com/comfyanonymous/ComfyUI/blob/master/comfy/latent_formats.py#L152-L169 const float flux_latent_rgb_proj[16][3] = { {-0.0346f, 0.0244f, 0.0681f}, @@ -87,4 +155,4 @@ void preview_latent_image(uint8_t* buffer, struct ggml_tensor* latents, const fl buffer[buffer_head++] = (uint8_t)(b * 255); } } -} \ No newline at end of file +} From 4c536b5fd3cef01a8f39ab7a1a878b741a78acd7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?St=C3=A9phane=20du=20Hamel?= Date: Sun, 31 Aug 2025 01:33:58 +0200 Subject: [PATCH 24/45] animated previews --- examples/cli/main.cpp | 37 ++++++++--- latent-preview.h | 46 ++++++------- stable-diffusion.cpp | 147 ++++++++++++++++++++++++++++-------------- stable-diffusion.h | 2 +- 4 files changed, 154 insertions(+), 78 deletions(-) diff --git a/examples/cli/main.cpp b/examples/cli/main.cpp index 25d8a578b..75a4e1d26 100644 --- a/examples/cli/main.cpp +++ b/examples/cli/main.cpp @@ -127,9 +127,9 @@ struct SDParams { float flow_shift = INFINITY; preview_t preview_method = PREVIEW_NONE; - int 
preview_interval = 1; - std::string preview_path = "preview.png"; - bool taesd_preview = false; + int preview_interval = 1; + std::string preview_path = "preview.png"; + bool taesd_preview = false; SDParams() { sd_sample_params_init(&sample_params); @@ -298,7 +298,7 @@ void print_usage(int argc, const char* argv[]) { printf(" --preview {%s,%s,%s,%s} preview method. (default is %s(disabled))\n", previews_str[0], previews_str[1], previews_str[2], previews_str[3], previews_str[PREVIEW_NONE]); printf(" %s is the fastest\n", previews_str[PREVIEW_PROJ]); printf(" --preview-interval [N] How often to save the image preview"); - printf(" --preview-path [PATH} path to write preview image to (default: ./preview.png)\n"); + printf(" --preview-path [PATH] path to write preview image to (default: ./preview.png)\n"); printf(" --color colors the logging tags according to level\n"); printf(" --chroma-disable-dit-mask disable dit mask for chroma\n"); printf(" --chroma-enable-t5-mask enable t5 mask for chroma\n"); @@ -506,7 +506,6 @@ void parse_args(int argc, const char** argv, SDParams& params) { {"-p", "--prompt", "", ¶ms.prompt}, {"-n", "--negative-prompt", "", ¶ms.negative_prompt}, {"", "--preview-path", "", ¶ms.preview_path}, - {"", "--upscale-model", "", ¶ms.esrgan_path}, }; @@ -762,7 +761,7 @@ void parse_args(int argc, const char** argv, SDParams& params) { } if (preview_method == -1) { fprintf(stderr, "error: preview method %s\n", - preview); + preview); return -1; } params.preview_method = (preview_t)preview_method; @@ -1065,15 +1064,37 @@ uint8_t* load_image(const char* image_path, int& width, int& height, int expecte } const char* preview_path; +float preview_fps; -void step_callback(int step, sd_image_t image) { - stbi_write_png(preview_path, image.width, image.height, image.channel, image.data, 0); +void step_callback(int step, int frame_count, sd_image_t* image) { + if (frame_count == 1) { + stbi_write_png(preview_path, image->width, image->height, image->channel, 
image->data, 0); + } else { + create_mjpg_avi_from_sd_images(preview_path, image, frame_count, preview_fps); + } } int main(int argc, const char* argv[]) { SDParams params; parse_args(argc, argv, params); preview_path = params.preview_path.c_str(); + if (params.video_frames > 4) { + size_t last_dot_pos = params.preview_path.find_last_of("."); + std::string base_path = params.preview_path; + std::string file_ext = ""; + if (last_dot_pos != std::string::npos) { // filename has extension + base_path = params.preview_path.substr(0, last_dot_pos); + file_ext = params.preview_path.substr(last_dot_pos); + std::transform(file_ext.begin(), file_ext.end(), file_ext.begin(), ::tolower); + } + if (file_ext == ".png") { + preview_path = (base_path + ".avi").c_str(); + } + } + preview_fps = params.fps; + if (params.preview_method == PREVIEW_PROJ) + preview_fps /= 4.0f; + params.sample_params.guidance.slg.layers = params.skip_layers.data(); params.sample_params.guidance.slg.layer_count = params.skip_layers.size(); params.high_noise_sample_params.guidance.slg.layers = params.high_noise_skip_layers.data(); diff --git a/latent-preview.h b/latent-preview.h index c9994a2bd..97be36e0a 100644 --- a/latent-preview.h +++ b/latent-preview.h @@ -123,36 +123,38 @@ const float sd_latent_rgb_proj[4][3]{ {-0.2120f, -0.2616f, -0.7177f}}; float sd_latent_rgb_bias[3] = {0,0,0}; -void preview_latent_image(uint8_t* buffer, struct ggml_tensor* latents, const float (*latent_rgb_proj)[3], const float latent_rgb_bias[3], int width, int height, int dim) { +void preview_latent_video(uint8_t* buffer, struct ggml_tensor* latents, const float (*latent_rgb_proj)[3], const float latent_rgb_bias[3], int width, int height, int frames, int dim) { size_t buffer_head = 0; - for (int j = 0; j < height; j++) { - for (int i = 0; i < width; i++) { - size_t latent_id = (i * latents->nb[0] + j * latents->nb[1]); - float r = 0, g = 0, b = 0; - for (int d = 0; d < dim; d++) { - float value = *(float*)((char*)latents->data 
+ latent_id + d * latents->nb[2]); - r += value * latent_rgb_proj[d][0]; - g += value * latent_rgb_proj[d][1]; - b += value * latent_rgb_proj[d][2]; - } + for (int k = 0; k < frames; k++) { + for (int j = 0; j < height; j++) { + for (int i = 0; i < width; i++) { + size_t latent_id = (i * latents->nb[0] + j * latents->nb[1] + k * latents->nb[2]); + float r = 0, g = 0, b = 0; + for (int d = 0; d < dim; d++) { + float value = *(float*)((char*)latents->data + latent_id + d * latents->nb[ggml_n_dims(latents) - 1]); + r += value * latent_rgb_proj[d][0]; + g += value * latent_rgb_proj[d][1]; + b += value * latent_rgb_proj[d][2]; + } // bias r += latent_rgb_bias[0]; g += latent_rgb_bias[1]; b += latent_rgb_bias[2]; - // change range - r = r * .5f + .5f; - g = g * .5f + .5f; - b = b * .5f + .5f; + // change range + r = r * .5f + .5f; + g = g * .5f + .5f; + b = b * .5f + .5f; - // clamp rgb values to [0,1] range - r = r >= 0 ? r <= 1 ? r : 1 : 0; - g = g >= 0 ? g <= 1 ? g : 1 : 0; - b = b >= 0 ? b <= 1 ? b : 1 : 0; + // clamp rgb values to [0,1] range + r = r >= 0 ? r <= 1 ? r : 1 : 0; + g = g >= 0 ? g <= 1 ? g : 1 : 0; + b = b >= 0 ? b <= 1 ? 
b : 1 : 0; - buffer[buffer_head++] = (uint8_t)(r * 255); - buffer[buffer_head++] = (uint8_t)(g * 255); - buffer[buffer_head++] = (uint8_t)(b * 255); + buffer[buffer_head++] = (uint8_t)(r * 255); + buffer[buffer_head++] = (uint8_t)(g * 255); + buffer[buffer_head++] = (uint8_t)(b * 255); + } } } } diff --git a/stable-diffusion.cpp b/stable-diffusion.cpp index 3d9b5a653..b69ddea87 100644 --- a/stable-diffusion.cpp +++ b/stable-diffusion.cpp @@ -361,8 +361,8 @@ class StableDiffusionGGML { offload_params_to_cpu, model_loader.tensor_storages_types); diffusion_model = std::make_shared(backend, - offload_params_to_cpu, - model_loader.tensor_storages_types); + offload_params_to_cpu, + model_loader.tensor_storages_types); } else if (sd_version_is_flux(version)) { bool is_chroma = false; for (auto pair : model_loader.tensor_storages_types) { @@ -398,11 +398,11 @@ class StableDiffusionGGML { 1, true); diffusion_model = std::make_shared(backend, - offload_params_to_cpu, - model_loader.tensor_storages_types, - "model.diffusion_model", - version, - sd_ctx_params->diffusion_flash_attn); + offload_params_to_cpu, + model_loader.tensor_storages_types, + "model.diffusion_model", + version, + sd_ctx_params->diffusion_flash_attn); if (strlen(SAFE_STR(sd_ctx_params->high_noise_diffusion_model_path)) > 0) { high_noise_diffusion_model = std::make_shared(backend, offload_params_to_cpu, @@ -1036,16 +1036,25 @@ class StableDiffusionGGML { enum SDVersion version, preview_t preview_mode, ggml_tensor* result, - std::function step_callback) { + std::function step_callback) { const uint32_t channel = 3; uint32_t width = latents->ne[0]; uint32_t height = latents->ne[1]; - uint32_t dim = latents->ne[2]; + uint32_t dim = latents->ne[ggml_n_dims(latents) - 1]; + if (preview_mode == PREVIEW_PROJ) { const float (*latent_rgb_proj)[channel]; float *latent_rgb_bias; - if (dim == 16) { + if (dim == 48) { + if (sd_version_is_wan(version)) { + latent_rgb_proj = wan_22_latent_rgb_proj; + } else { + 
LOG_WARN("No latent to RGB projection known for this model"); + // unknown model + return; + } + } else if (dim == 16) { // 16 channels VAE -> Flux or SD3 if (sd_version_is_sd3(version)) { @@ -1053,6 +1062,8 @@ class StableDiffusionGGML { latent_rgb_bias = sd3_latent_rgb_bias; } else if (sd_version_is_flux(version)) { latent_rgb_proj = flux_latent_rgb_proj; + } else if (sd_version_is_wan(version)) { + latent_rgb_proj = wan_21_latent_rgb_proj; latent_rgb_bias = flux_latent_rgb_bias; } else { LOG_WARN("No latent to RGB projection known for this model"); @@ -1078,16 +1089,22 @@ class StableDiffusionGGML { // unknown latent space return; } - uint8_t* data = (uint8_t*)malloc(width * height * channel * sizeof(uint8_t)); - preview_latent_image(data, latents, latent_rgb_proj,latent_rgb_bias, width, height, dim); - sd_image_t image = { - width, - height, - channel, - data}; - step_callback(step, image); - free(image.data); + uint32_t frames = 1; + if (ggml_n_dims(latents) == 4) { + frames = latents->ne[2]; + } + + uint8_t* data = (uint8_t*)malloc(frames * width * height * channel * sizeof(uint8_t)); + + preview_latent_video(data, latents, latent_rgb_proj,latent_rgb_bias, width, height, frames, dim); + sd_image_t* images = (sd_image_t*)malloc(frames * sizeof(sd_image_t)); + for (int i = 0; i < frames; i++) { + images[i] = {width, height, channel, data + i * width * height * channel}; + } + step_callback(step, frames, images); + free(data); + free(images); } else { if (preview_mode == PREVIEW_VAE) { process_latent_out(latents); @@ -1101,8 +1118,10 @@ class StableDiffusionGGML { } else { first_stage_model->compute(n_threads, latents, true, &result, work_ctx); } + first_stage_model->free_compute_buffer(); process_vae_output_tensor(result); + process_latent_in(latents); } else if (preview_mode == PREVIEW_TAE) { if (tae_first_stage == nullptr) { LOG_WARN("TAE not found for preview"); @@ -1121,15 +1140,30 @@ class StableDiffusionGGML { } else { return; } + 
ggml_tensor_clamp(result, 0.0f, 1.0f); - sd_image_t image = { - width * 8, - height * 8, - channel, - sd_tensor_to_image(result)}; + uint32_t frames = 1; + if (ggml_n_dims(latents) == 4) { + frames = result->ne[2]; + } + + sd_image_t* images = (sd_image_t*)malloc(frames * sizeof(sd_image_t)); + print_ggml_tensor(result,true); + for (size_t i = 0; i < frames; i++) { + images[i].width = result->ne[0]; + images[i].height = result->ne[1]; + images[i].channel = 3; + images[i].data = sd_tensor_to_image(result, i, ggml_n_dims(latents) == 4); + } + + step_callback(step, frames, images); + ggml_tensor_scale(result, 0); - step_callback(step, image); - free(image.data); + for (int i = 0; i < frames; i++) { + free(images[i].data); + } + + free(images); } } @@ -1200,13 +1234,32 @@ class StableDiffusionGGML { struct ggml_tensor* denoised = ggml_dup_tensor(work_ctx, x); struct ggml_tensor* preview_tensor = NULL; - auto sd_preview_mode = sd_get_preview_mode(); + auto sd_preview_mode = sd_get_preview_mode(); if (sd_preview_mode != PREVIEW_NONE && sd_preview_mode != PREVIEW_PROJ) { - preview_tensor = ggml_new_tensor_4d(work_ctx, GGML_TYPE_F32, - (denoised->ne[0] * 8), - (denoised->ne[1] * 8), - 3, - denoised->ne[3]); + int64_t W = x->ne[0] * 8; + int64_t H = x->ne[1] * 8; + if (ggml_n_dims(x) == 4) { + // assuming video mode (if batch processing gets implemented this will break) + int T = x->ne[2]; + if (sd_version_is_wan(version)) { + T = ((T - 1) * 4) + 1; + if (version == VERSION_WAN2_2_TI2V) { + W = x->ne[0] * 16; + H = x->ne[1] * 16; + } + } + preview_tensor = ggml_new_tensor_4d(work_ctx, GGML_TYPE_F32, + W, + H, + T, + 3); + } else { + preview_tensor = ggml_new_tensor_4d(work_ctx, GGML_TYPE_F32, + W, + H, + 3, + x->ne[3]); + } } auto denoise = [&](ggml_tensor* input, float sigma, int step) -> ggml_tensor* { @@ -1378,7 +1431,7 @@ class StableDiffusionGGML { pretty_progress(step, (int)steps, (t1 - t0) / 1000000.f); // LOG_INFO("step %d sampling completed taking %.2fs", step, (t1 
- t0) * 1.0f / 1000000); } - auto sd_preview_cb = sd_get_preview_callback(); + auto sd_preview_cb = sd_get_preview_callback(); auto sd_preview_mode = sd_get_preview_mode(); if (sd_preview_cb != NULL) { if (step % sd_get_preview_interval() == 0) { @@ -1465,12 +1518,12 @@ class StableDiffusionGGML { -0.0313f, -0.1649f, 0.0117f, 0.0723f, -0.2839f, -0.2083f, -0.0520f, 0.3748f, 0.0152f, 0.1957f, 0.1433f, -0.2944f, 0.3573f, -0.0548f, -0.1681f, -0.0667f}; latents_std_vec = { - 0.4765f, 1.0364f, 0.4514f, 1.1677f, 0.5313f, 0.4990f, 0.4818f, 0.5013f, - 0.8158f, 1.0344f, 0.5894f, 1.0901f, 0.6885f, 0.6165f, 0.8454f, 0.4978f, - 0.5759f, 0.3523f, 0.7135f, 0.6804f, 0.5833f, 1.4146f, 0.8986f, 0.5659f, - 0.7069f, 0.5338f, 0.4889f, 0.4917f, 0.4069f, 0.4999f, 0.6866f, 0.4093f, - 0.5709f, 0.6065f, 0.6415f, 0.4944f, 0.5726f, 1.2042f, 0.5458f, 1.6887f, - 0.3971f, 1.0600f, 0.3943f, 0.5537f, 0.5444f, 0.4089f, 0.7468f, 0.7744f}; + 0.4765f, 1.0364f, 0.4514f, 1.1677f, 0.5313f, 0.4990f, 0.4818f, 0.5013f, + 0.8158f, 1.0344f, 0.5894f, 1.0901f, 0.6885f, 0.6165f, 0.8454f, 0.4978f, + 0.5759f, 0.3523f, 0.7135f, 0.6804f, 0.5833f, 1.4146f, 0.8986f, 0.5659f, + 0.7069f, 0.5338f, 0.4889f, 0.4917f, 0.4069f, 0.4999f, 0.6866f, 0.4093f, + 0.5709f, 0.6065f, 0.6415f, 0.4944f, 0.5726f, 1.2042f, 0.5458f, 1.6887f, + 0.3971f, 1.0600f, 0.3943f, 0.5537f, 0.5444f, 0.4089f, 0.7468f, 0.7744f}; } for (int i = 0; i < latent->ne[3]; i++) { float mean = latents_mean_vec[i]; @@ -1505,12 +1558,12 @@ class StableDiffusionGGML { -0.0313f, -0.1649f, 0.0117f, 0.0723f, -0.2839f, -0.2083f, -0.0520f, 0.3748f, 0.0152f, 0.1957f, 0.1433f, -0.2944f, 0.3573f, -0.0548f, -0.1681f, -0.0667f}; latents_std_vec = { - 0.4765f, 1.0364f, 0.4514f, 1.1677f, 0.5313f, 0.4990f, 0.4818f, 0.5013f, - 0.8158f, 1.0344f, 0.5894f, 1.0901f, 0.6885f, 0.6165f, 0.8454f, 0.4978f, - 0.5759f, 0.3523f, 0.7135f, 0.6804f, 0.5833f, 1.4146f, 0.8986f, 0.5659f, - 0.7069f, 0.5338f, 0.4889f, 0.4917f, 0.4069f, 0.4999f, 0.6866f, 0.4093f, - 0.5709f, 0.6065f, 0.6415f, 
0.4944f, 0.5726f, 1.2042f, 0.5458f, 1.6887f, - 0.3971f, 1.0600f, 0.3943f, 0.5537f, 0.5444f, 0.4089f, 0.7468f, 0.7744f}; + 0.4765f, 1.0364f, 0.4514f, 1.1677f, 0.5313f, 0.4990f, 0.4818f, 0.5013f, + 0.8158f, 1.0344f, 0.5894f, 1.0901f, 0.6885f, 0.6165f, 0.8454f, 0.4978f, + 0.5759f, 0.3523f, 0.7135f, 0.6804f, 0.5833f, 1.4146f, 0.8986f, 0.5659f, + 0.7069f, 0.5338f, 0.4889f, 0.4917f, 0.4069f, 0.4999f, 0.6866f, 0.4093f, + 0.5709f, 0.6065f, 0.6415f, 0.4944f, 0.5726f, 1.2042f, 0.5458f, 1.6887f, + 0.3971f, 1.0600f, 0.3943f, 0.5537f, 0.5444f, 0.4089f, 0.7468f, 0.7744f}; } for (int i = 0; i < latent->ne[3]; i++) { float mean = latents_mean_vec[i]; @@ -1967,8 +2020,8 @@ sd_image_t* generate_image_internal(sd_ctx_t* sd_ctx, std::string input_id_images_path, std::vector ref_latents, bool increase_ref_index, - ggml_tensor* concat_latent = NULL, - ggml_tensor* denoise_mask = NULL) { + ggml_tensor* concat_latent = NULL, + ggml_tensor* denoise_mask = NULL) { if (seed < 0) { // Generally, when using the provided command line, the seed is always >0. 
// However, to prevent potential issues if 'stable-diffusion.cpp' is invoked as a library diff --git a/stable-diffusion.h b/stable-diffusion.h index bd39fc018..779590b60 100644 --- a/stable-diffusion.h +++ b/stable-diffusion.h @@ -226,7 +226,7 @@ typedef struct sd_ctx_t sd_ctx_t; typedef void (*sd_log_cb_t)(enum sd_log_level_t level, const char* text, void* data); typedef void (*sd_progress_cb_t)(int step, int steps, float time, void* data); -typedef void (*sd_preview_cb_t)(int, sd_image_t); +typedef void (*sd_preview_cb_t)(int, int, sd_image_t*); SD_API void sd_set_log_callback(sd_log_cb_t sd_log_cb, void* data); SD_API void sd_set_progress_callback(sd_progress_cb_t cb, void* data); From 7a0ab287db32d897e7941673bf444114204ab9fd Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?St=C3=A9phane=20du=20Hamel?= Date: Sun, 31 Aug 2025 02:08:30 +0200 Subject: [PATCH 25/45] latent proj bias --- latent-preview.h | 2 ++ stable-diffusion.cpp | 5 ++++- 2 files changed, 6 insertions(+), 1 deletion(-) diff --git a/latent-preview.h b/latent-preview.h index 97be36e0a..5c1606053 100644 --- a/latent-preview.h +++ b/latent-preview.h @@ -15,6 +15,7 @@ const float wan_21_latent_rgb_proj[16][3] = { {0.0060, -0.0633, 0.0005}, {0.3477, 0.2275, 0.2950}, {0.1984, 0.0913, 0.1861}}; +float wan_21_latent_rgb_bias[3] = {-0.1223, -0.1889, -0.1976}; const float wan_22_latent_rgb_proj[48][3] = { {0.0119, 0.0103, 0.0046}, @@ -65,6 +66,7 @@ const float wan_22_latent_rgb_proj[48][3] = { {0.0421, 0.0451, 0.0373}, {0.0504, -0.0483, -0.0356}, {-0.0837, 0.0168, 0.0055}}; +float wan_22_latent_rgb_bias[3] = {0.0317, -0.0878, -0.1388}; // https://github.com/comfyanonymous/ComfyUI/blob/master/comfy/latent_formats.py#L152-L169 const float flux_latent_rgb_proj[16][3] = { diff --git a/stable-diffusion.cpp b/stable-diffusion.cpp index b69ddea87..42744cccc 100644 --- a/stable-diffusion.cpp +++ b/stable-diffusion.cpp @@ -1045,10 +1045,12 @@ class StableDiffusionGGML { if (preview_mode == PREVIEW_PROJ) { const float 
(*latent_rgb_proj)[channel]; float *latent_rgb_bias; + float *latent_rgb_bias; if (dim == 48) { if (sd_version_is_wan(version)) { latent_rgb_proj = wan_22_latent_rgb_proj; + latent_rgb_bias = wan_22_latent_rgb_bias; } else { LOG_WARN("No latent to RGB projection known for this model"); // unknown model @@ -1062,9 +1064,10 @@ class StableDiffusionGGML { latent_rgb_bias = sd3_latent_rgb_bias; } else if (sd_version_is_flux(version)) { latent_rgb_proj = flux_latent_rgb_proj; + latent_rgb_bias = flux_latent_rgb_bias; } else if (sd_version_is_wan(version)) { latent_rgb_proj = wan_21_latent_rgb_proj; - latent_rgb_bias = flux_latent_rgb_bias; + latent_rgb_bias = wan_21_latent_rgb_bias; } else { LOG_WARN("No latent to RGB projection known for this model"); // unknown model From 3e0ef2796485378145c9846e666bcd8bfab77eda Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?St=C3=A9phane=20du=20Hamel?= Date: Tue, 9 Sep 2025 13:20:41 +0200 Subject: [PATCH 26/45] fix dup --- stable-diffusion.cpp | 1 - 1 file changed, 1 deletion(-) diff --git a/stable-diffusion.cpp b/stable-diffusion.cpp index 42744cccc..716269930 100644 --- a/stable-diffusion.cpp +++ b/stable-diffusion.cpp @@ -1045,7 +1045,6 @@ class StableDiffusionGGML { if (preview_mode == PREVIEW_PROJ) { const float (*latent_rgb_proj)[channel]; float *latent_rgb_bias; - float *latent_rgb_bias; if (dim == 48) { if (sd_version_is_wan(version)) { From 70a16116d80fc44b5506306d37a2388be3a82820 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?St=C3=A9phane=20du=20Hamel?= Date: Wed, 15 Oct 2025 10:14:04 +0200 Subject: [PATCH 27/45] Support latent2rgb preview for qwen image (via wan21) --- stable-diffusion.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/stable-diffusion.cpp b/stable-diffusion.cpp index 4646c2cf4..5fee56c2c 100644 --- a/stable-diffusion.cpp +++ b/stable-diffusion.cpp @@ -1136,7 +1136,7 @@ class StableDiffusionGGML { } else if (sd_version_is_flux(version)) { latent_rgb_proj = flux_latent_rgb_proj; latent_rgb_bias = 
flux_latent_rgb_bias; - } else if (sd_version_is_wan(version)) { + } else if (sd_version_is_wan(version) || sd_version_is_qwen_image(version)) { latent_rgb_proj = wan_21_latent_rgb_proj; latent_rgb_bias = wan_21_latent_rgb_bias; } else { From e2ce17dd618e96333c0d13cae136f442bddf7a94 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?St=C3=A9phane=20du=20Hamel?= Date: Wed, 15 Oct 2025 14:40:17 +0200 Subject: [PATCH 28/45] Fix ctx memory pool size overwritten during merge --- stable-diffusion.cpp | 18 +----------------- 1 file changed, 1 insertion(+), 17 deletions(-) diff --git a/stable-diffusion.cpp b/stable-diffusion.cpp index 5fee56c2c..a0349dda8 100644 --- a/stable-diffusion.cpp +++ b/stable-diffusion.cpp @@ -2638,23 +2638,7 @@ sd_image_t* generate_image(sd_ctx_t* sd_ctx, const sd_img_gen_params_t* sd_img_g } struct ggml_init_params params; - params.mem_size = static_cast<size_t>(10 * 1024 * 1024); // 10 MB - if (sd_version_is_sd3(sd_ctx->sd->version)) { - params.mem_size *= 3; - } - if (sd_version_is_flux(sd_ctx->sd->version)) { - params.mem_size *= 4; - } - if (sd_ctx->sd->stacked_id) { - params.mem_size += static_cast<size_t>(10 * 1024 * 1024); // 10 MB - } - auto sd_preview_mode = sd_get_preview_mode(); - if (sd_preview_mode != PREVIEW_NONE && sd_preview_mode != PREVIEW_PROJ) { - params.mem_size *= 2; - } - params.mem_size += width * height * 3 * sizeof(float) * 3; - params.mem_size += width * height * 3 * sizeof(float) * 3 * sd_img_gen_params->ref_images_count; - params.mem_size *= sd_img_gen_params->batch_count; + params.mem_size = static_cast<size_t>(1024 * 1024) * 1024; // 1G params.mem_buffer = NULL; params.no_alloc = false; // LOG_DEBUG("mem_size %u ", params.mem_size); From f7b53e50456ad8b7ac5eda3072833008b0a77d79 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?St=C3=A9phane=20du=20Hamel?= Date: Sun, 19 Oct 2025 17:20:45 +0200 Subject: [PATCH 29/45] fix build and update help messages --- examples/cli/main.cpp | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git
a/examples/cli/main.cpp b/examples/cli/main.cpp index 1c239e28d..2aeed946e 100644 --- a/examples/cli/main.cpp +++ b/examples/cli/main.cpp @@ -668,7 +668,7 @@ void parse_args(int argc, const char** argv, SDParams& params) { &params.sample_params.shifted_timestep}, {"", "--preview-interval", - "How often to save the image preview", + "interval in denoising steps between consecutive updates of the image preview file (default is 1, meaning updating at every step)", &params.preview_interval}, }; @@ -826,7 +826,7 @@ void parse_args(int argc, const char** argv, SDParams& params) { false, &params.auto_resize_ref_image}, {"", "--taesd-preview-only", - "prevents usage of taesd for decoding the final image. (for use with --preview " + previews_str[PREVIEW_TAE] + ")", + std::string("prevents usage of taesd for decoding the final image. (for use with --preview ") + previews_str[PREVIEW_TAE] + ")", false, &params.taesd_preview}, }; @@ -1157,7 +1157,10 @@ void parse_args(int argc, const char** argv, SDParams& params) { "--vae-relative-tile-size", "relative tile size for vae tiling, format [X]x[Y], in fraction of image size if < 1, in number of tiles per dim if >=1 (overrides --vae-tile-size)", on_relative_tile_size_arg}, - {"", "--preview", "preview method. must be one of " previews_str[0] + ", " + previews_str[1] + ", " + previews_str[2] + ", " + previews_str[3] + "(default is " + previews_str[PREVIEW_NONE] + "(disabled))\n", on_preview_arg}, + {"", + "--preview", + std::string("preview method.
must be one of the following [") + previews_str[0] + ", " + previews_str[1] + ", " + previews_str[2] + ", " + previews_str[3] + "] (default is " + previews_str[PREVIEW_NONE] + ")\n", + on_preview_arg}, }; if (!parse_options(argc, argv, options)) { From 0a59f36b8c4e5af8d0683390736778f78996a463 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?St=C3=A9phane=20du=20Hamel?= Date: Sun, 19 Oct 2025 17:23:07 +0200 Subject: [PATCH 30/45] update help message in readme --- examples/cli/README.md | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/examples/cli/README.md b/examples/cli/README.md index ee17d17da..abbb0b6f8 100644 --- a/examples/cli/README.md +++ b/examples/cli/README.md @@ -32,6 +32,7 @@ Options: -o, --output path to write result image to (default: ./output.png) -p, --prompt the prompt to render -n, --negative-prompt the negative prompt (default: "") + --preview-path path to write preview image to (default: ./preview.png) --upscale-model path to esrgan model. -t, --threads number of threads to use during computation (default: -1). If threads <= 0, then threads will be set to the number of CPU physical cores @@ -48,6 +49,8 @@ Options: --fps fps (default: 24) --timestep-shift shift timestep for NitroFusion models (default: 0). recommended N for NitroSD-Realism around 250 and 500 for NitroSD-Vibrant + --preview-interval interval in denoising steps between consecutive updates of the image preview file (default is 1, meaning updating at + every step) --cfg-scale unconditional guidance scale: (default: 7.0) --img-cfg-scale image guidance scale for inpaint or instruct-pix2pix models: (default: same as --cfg-scale) --guidance distilled guidance scale for models with guidance input (default: 3.5) @@ -86,6 +89,7 @@ Options: --chroma-enable-t5-mask enable t5 mask for chroma --increase-ref-index automatically increase the indices of references images based on the order they are listed (starting with 1). 
--disable-auto-resize-ref-image disable auto resize of ref images + --taesd-preview-only prevents usage of taesd for decoding the final image. (for use with --preview tae) -M, --mode run mode, one of [img_gen, vid_gen, upscale, convert], default: img_gen --type weight type (examples: f32, f16, q4_0, q4_1, q5_0, q5_1, q8_0, q2_K, q3_K, q4_K). If not specified, the default is the type of the weight file @@ -107,4 +111,5 @@ Options: --vae-tile-size tile size for vae tiling, format [X]x[Y] (default: 32x32) --vae-relative-tile-size relative tile size for vae tiling, format [X]x[Y], in fraction of image size if < 1, in number of tiles per dim if >=1 (overrides --vae-tile-size) + --preview preview method. must be one of the following [none, proj, tae, vae] (default is none) ``` \ No newline at end of file From 059f025c5763ba2c47209cabe07c58056425197e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?St=C3=A9phane=20du=20Hamel?= Date: Thu, 23 Oct 2025 01:46:06 +0200 Subject: [PATCH 31/45] remove tensor shape spam --- stable-diffusion.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/stable-diffusion.cpp b/stable-diffusion.cpp index ba2f0d7fe..a30941377 100644 --- a/stable-diffusion.cpp +++ b/stable-diffusion.cpp @@ -1251,7 +1251,7 @@ class StableDiffusionGGML { } sd_image_t* images = (sd_image_t*)malloc(frames * sizeof(sd_image_t)); - print_ggml_tensor(result,true); + // print_ggml_tensor(result,true); for (size_t i = 0; i < frames; i++) { images[i].width = result->ne[0]; images[i].height = result->ne[1]; From 6563d46cf1eca74c28d47aaf381d7fe46860a35a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?St=C3=A9phane=20du=20Hamel?= Date: Thu, 23 Oct 2025 01:51:15 +0200 Subject: [PATCH 32/45] Fix progress display --- stable-diffusion.cpp | 4 ---- 1 file changed, 4 deletions(-) diff --git a/stable-diffusion.cpp b/stable-diffusion.cpp index a30941377..5fb32d0f9 100644 --- a/stable-diffusion.cpp +++ b/stable-diffusion.cpp @@ -1539,10 +1539,6 @@ class StableDiffusionGGML { if 
(denoise_mask != nullptr) { apply_mask(denoised, init_latent, denoise_mask); } - if (step > 0) { - pretty_progress(step, (int)steps, (t1 - t0) / 1000000.f); - // LOG_INFO("step %d sampling completed taking %.2fs", step, (t1 - t0) * 1.0f / 1000000); - } auto sd_preview_cb = sd_get_preview_callback(); auto sd_preview_mode = sd_get_preview_mode(); if (sd_preview_cb != NULL) { From b1fc7cdec0c83e2fca4ae46f54fa9ad85d995fce Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?St=C3=A9phane=20du=20Hamel?= Date: Sat, 25 Oct 2025 19:55:22 +0200 Subject: [PATCH 33/45] preview: support pixel space diffusion --- latent-preview.h | 28 ++++++++++++++++++---------- stable-diffusion.cpp | 18 ++++++++---------- 2 files changed, 26 insertions(+), 20 deletions(-) diff --git a/latent-preview.h b/latent-preview.h index 5c1606053..67011837c 100644 --- a/latent-preview.h +++ b/latent-preview.h @@ -132,17 +132,25 @@ void preview_latent_video(uint8_t* buffer, struct ggml_tensor* latents, const fl for (int i = 0; i < width; i++) { size_t latent_id = (i * latents->nb[0] + j * latents->nb[1] + k * latents->nb[2]); float r = 0, g = 0, b = 0; - for (int d = 0; d < dim; d++) { - float value = *(float*)((char*)latents->data + latent_id + d * latents->nb[ggml_n_dims(latents) - 1]); - r += value * latent_rgb_proj[d][0]; - g += value * latent_rgb_proj[d][1]; - b += value * latent_rgb_proj[d][2]; + if(latent_rgb_proj!=NULL){ + for (int d = 0; d < dim; d++) { + float value = *(float*)((char*)latents->data + latent_id + d * latents->nb[ggml_n_dims(latents) - 1]); + r += value * latent_rgb_proj[d][0]; + g += value * latent_rgb_proj[d][1]; + b += value * latent_rgb_proj[d][2]; + } + } else { + // interpret first 3 channels as RGB + r = *(float*)((char*)latents->data + latent_id + 0 * latents->nb[ggml_n_dims(latents) - 1]); + g = *(float*)((char*)latents->data + latent_id + 1 * latents->nb[ggml_n_dims(latents) - 1]); + b = *(float*)((char*)latents->data + latent_id + 2 * latents->nb[ggml_n_dims(latents) - 1]); + } + 
if(latent_rgb_bias!=NULL){ + // bias + r += latent_rgb_bias[0]; + g += latent_rgb_bias[1]; + b += latent_rgb_bias[2]; } - // bias - r += latent_rgb_bias[0]; - g += latent_rgb_bias[1]; - b += latent_rgb_bias[2]; - // change range r = r * .5f + .5f; g = g * .5f + .5f; diff --git a/stable-diffusion.cpp b/stable-diffusion.cpp index 36009bcd7..23c02f021 100644 --- a/stable-diffusion.cpp +++ b/stable-diffusion.cpp @@ -1155,8 +1155,8 @@ class StableDiffusionGGML { uint32_t dim = latents->ne[ggml_n_dims(latents) - 1]; if (preview_mode == PREVIEW_PROJ) { - const float (*latent_rgb_proj)[channel]; - float *latent_rgb_bias; + const float (*latent_rgb_proj)[channel] = NULL; + float *latent_rgb_bias = NULL; if (dim == 48) { if (sd_version_is_wan(version)) { @@ -1198,6 +1198,8 @@ class StableDiffusionGGML { LOG_WARN("No latent to RGB projection known for this model"); return; } + } else if (dim == 4) { + // Do nothing, assuming already RGB latents } else { LOG_WARN("No latent to RGB projection known for this model"); // unknown latent space @@ -1227,7 +1229,7 @@ class StableDiffusionGGML { auto on_tiling = [&](ggml_tensor* in, ggml_tensor* out, bool init) { first_stage_model->compute(n_threads, in, true, &out, NULL); }; - silent_tiling(latents, result, 8, 32, 0.5f, on_tiling); + silent_tiling(latents, result, get_vae_scale_factor(), 32, 0.5f, on_tiling); } else { first_stage_model->compute(n_threads, latents, true, &result, work_ctx); @@ -1246,7 +1248,7 @@ class StableDiffusionGGML { auto on_tiling = [&](ggml_tensor* in, ggml_tensor* out, bool init) { tae_first_stage->compute(n_threads, in, true, &out, NULL); }; - silent_tiling(latents, result, 8, 64, 0.5f, on_tiling); + silent_tiling(latents, result, get_vae_scale_factor(), 64, 0.5f, on_tiling); } else { tae_first_stage->compute(n_threads, latents, true, &result, work_ctx); } @@ -1359,17 +1361,13 @@ class StableDiffusionGGML { struct ggml_tensor* preview_tensor = NULL; auto sd_preview_mode = sd_get_preview_mode(); if 
(sd_preview_mode != PREVIEW_NONE && sd_preview_mode != PREVIEW_PROJ) { - int64_t W = x->ne[0] * 8; - int64_t H = x->ne[1] * 8; + int64_t W = x->ne[0] * get_vae_scale_factor(); + int64_t H = x->ne[1] * get_vae_scale_factor(); if (ggml_n_dims(x) == 4) { // assuming video mode (if batch processing gets implemented this will break) int T = x->ne[2]; if (sd_version_is_wan(version)) { T = ((T - 1) * 4) + 1; - if (version == VERSION_WAN2_2_TI2V) { - W = x->ne[0] * 16; - H = x->ne[1] * 16; - } } preview_tensor = ggml_new_tensor_4d(work_ctx, GGML_TYPE_F32, W, From 31d36b2ee7ce8ac3b49229bb9228a7ea7cdf7fe0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?St=C3=A9phane=20du=20Hamel?= Date: Sat, 25 Oct 2025 19:57:40 +0200 Subject: [PATCH 34/45] include preview (and apply_mask) in speed stats properly --- stable-diffusion.cpp | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/stable-diffusion.cpp b/stable-diffusion.cpp index 23c02f021..abb5a0e95 100644 --- a/stable-diffusion.cpp +++ b/stable-diffusion.cpp @@ -1539,12 +1539,6 @@ class StableDiffusionGGML { vec_denoised[i] = latent_result * c_out + vec_input[i] * c_skip; } - int64_t t1 = ggml_time_us(); - if (step > 0 || step == -(int)steps) { - int showstep = std::abs(step); - pretty_progress(showstep, (int)steps, (t1 - t0) / 1000000.f / showstep); - // LOG_INFO("step %d sampling completed taking %.2fs", step, (t1 - t0) * 1.0f / 1000000); - } if (denoise_mask != nullptr) { apply_mask(denoised, init_latent, denoise_mask); } @@ -1555,6 +1549,13 @@ class StableDiffusionGGML { preview_image(work_ctx, step, denoised, version, sd_preview_mode, preview_tensor, sd_preview_cb); } } + + int64_t t1 = ggml_time_us(); + if (step > 0 || step == -(int)steps) { + int showstep = std::abs(step); + pretty_progress(showstep, (int)steps, (t1 - t0) / 1000000.f / showstep); + // LOG_INFO("step %d sampling completed taking %.2fs", step, (t1 - t0) * 1.0f / 1000000); + } return denoised; }; From 
4e3500c99e8ef80c0b0863410e20e3440aaf7c55 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?St=C3=A9phane=20du=20Hamel?= Date: Sat, 25 Oct 2025 20:17:59 +0200 Subject: [PATCH 35/45] support noisy preview via API --- examples/cli/main.cpp | 7 +++++-- stable-diffusion.cpp | 23 +++++++++++++++-------- stable-diffusion.h | 4 ++-- util.cpp | 14 ++++++++++++-- util.h | 2 ++ 5 files changed, 36 insertions(+), 14 deletions(-) diff --git a/examples/cli/main.cpp b/examples/cli/main.cpp index 2aeed946e..c693aaee1 100644 --- a/examples/cli/main.cpp +++ b/examples/cli/main.cpp @@ -1506,7 +1506,10 @@ bool load_images_from_dir(const std::string dir, const char* preview_path; float preview_fps; -void step_callback(int step, int frame_count, sd_image_t* image) { +void step_callback(int step, int frame_count, sd_image_t* image, bool is_noisy) { + (void)is_noisy; + // is_noisy is set to true if the preview corresponds to noisy latents, false if it's denoised latents + // unused in this app, it will either be always noisy or always denoised here if (frame_count == 1) { stbi_write_png(preview_path, image->width, image->height, image->channel, image->data, 0); } else { @@ -1541,7 +1544,7 @@ int main(int argc, const char* argv[]) { params.high_noise_sample_params.guidance.slg.layer_count = params.high_noise_skip_layers.size(); sd_set_log_callback(sd_log_cb, (void*)¶ms); - sd_set_preview_callback((sd_preview_cb_t)step_callback, params.preview_method, params.preview_interval); + sd_set_preview_callback((sd_preview_cb_t)step_callback, params.preview_method, params.preview_interval, true, false); if (params.verbose) { print_params(params); diff --git a/stable-diffusion.cpp b/stable-diffusion.cpp index abb5a0e95..18da75c35 100644 --- a/stable-diffusion.cpp +++ b/stable-diffusion.cpp @@ -1148,7 +1148,8 @@ class StableDiffusionGGML { enum SDVersion version, preview_t preview_mode, ggml_tensor* result, - std::function step_callback) { + std::function step_callback, + bool is_noisy) { const uint32_t 
channel = 3; uint32_t width = latents->ne[0]; uint32_t height = latents->ne[1]; @@ -1218,7 +1219,7 @@ class StableDiffusionGGML { for (int i = 0; i < frames; i++) { images[i] = {width, height, channel, data + i * width * height * channel}; } - step_callback(step, frames, images); + step_callback(step, frames, images, is_noisy); free(data); free(images); } else { @@ -1272,7 +1273,7 @@ class StableDiffusionGGML { images[i].data = sd_tensor_to_image(result, i, ggml_n_dims(latents) == 4); } - step_callback(step, frames, images); + step_callback(step, frames, images, is_noisy); ggml_tensor_scale(result, 0); for (int i = 0; i < frames; i++) { @@ -1384,6 +1385,8 @@ class StableDiffusionGGML { } auto denoise = [&](ggml_tensor* input, float sigma, int step) -> ggml_tensor* { + auto sd_preview_cb = sd_get_preview_callback(); + auto sd_preview_mode = sd_get_preview_mode(); if (step == 1 || step == -1) { pretty_progress(0, (int)steps, 0); } @@ -1418,6 +1421,11 @@ class StableDiffusionGGML { if (denoise_mask != nullptr && version == VERSION_WAN2_2_TI2V) { apply_mask(noised_input, init_latent, denoise_mask); } + if (sd_preview_cb != NULL && sd_should_preview_noisy()) { + if (step % sd_get_preview_interval() == 0) { + preview_image(work_ctx, step, noised_input, version, sd_preview_mode, preview_tensor, sd_preview_cb, true); + } + } std::vector controls; @@ -1542,14 +1550,13 @@ class StableDiffusionGGML { if (denoise_mask != nullptr) { apply_mask(denoised, init_latent, denoise_mask); } - auto sd_preview_cb = sd_get_preview_callback(); - auto sd_preview_mode = sd_get_preview_mode(); - if (sd_preview_cb != NULL) { + + if (sd_preview_cb != NULL && sd_should_preview_denoised()) { if (step % sd_get_preview_interval() == 0) { - preview_image(work_ctx, step, denoised, version, sd_preview_mode, preview_tensor, sd_preview_cb); + preview_image(work_ctx, step, denoised, version, sd_preview_mode, preview_tensor, sd_preview_cb, false); } } - + int64_t t1 = ggml_time_us(); if (step > 0 || step 
== -(int)steps) { int showstep = std::abs(step); diff --git a/stable-diffusion.h b/stable-diffusion.h index e82a7fd81..9e99d53de 100644 --- a/stable-diffusion.h +++ b/stable-diffusion.h @@ -263,11 +263,11 @@ typedef struct sd_ctx_t sd_ctx_t; typedef void (*sd_log_cb_t)(enum sd_log_level_t level, const char* text, void* data); typedef void (*sd_progress_cb_t)(int step, int steps, float time, void* data); -typedef void (*sd_preview_cb_t)(int, int, sd_image_t*); +typedef void (*sd_preview_cb_t)(int step, int frame_count, sd_image_t* frames, bool is_noisy); SD_API void sd_set_log_callback(sd_log_cb_t sd_log_cb, void* data); SD_API void sd_set_progress_callback(sd_progress_cb_t cb, void* data); -SD_API void sd_set_preview_callback(sd_preview_cb_t cb, preview_t mode, int interval); +SD_API void sd_set_preview_callback(sd_preview_cb_t cb, preview_t mode, int interval, bool denoised, bool noisy); SD_API int32_t get_num_physical_cores(); SD_API const char* sd_get_system_info(); diff --git a/util.cpp b/util.cpp index 23b6c3b2a..0fa4bbcd7 100644 --- a/util.cpp +++ b/util.cpp @@ -189,8 +189,10 @@ static sd_progress_cb_t sd_progress_cb = nullptr; void* sd_progress_cb_data = nullptr; static sd_preview_cb_t sd_preview_cb = NULL; -preview_t sd_preview_mode = PREVIEW_NONE; +preview_t sd_preview_mode = PREVIEW_NONE; int sd_preview_interval = 1; +bool sd_preview_denoised = true; +bool sd_preview_noisy = false; std::u32string utf8_to_utf32(const std::string& utf8_str) { std::wstring_convert, char32_t> converter; @@ -335,10 +337,12 @@ void sd_set_progress_callback(sd_progress_cb_t cb, void* data) { sd_progress_cb = cb; sd_progress_cb_data = data; } -void sd_set_preview_callback(sd_preview_cb_t cb, preview_t mode = PREVIEW_PROJ, int interval = 1) { +void sd_set_preview_callback(sd_preview_cb_t cb, preview_t mode = PREVIEW_PROJ, int interval = 1, bool denoised = true, bool noisy = false) { sd_preview_cb = cb; sd_preview_mode = mode; sd_preview_interval = interval; + sd_preview_denoised = 
denoised; + sd_preview_noisy = noisy; } sd_preview_cb_t sd_get_preview_callback() { @@ -351,6 +355,12 @@ preview_t sd_get_preview_mode() { int sd_get_preview_interval() { return sd_preview_interval; } +bool sd_should_preview_denoised() { + return sd_preview_denoised; +} +bool sd_should_preview_noisy() { + return sd_preview_noisy; +} sd_progress_cb_t sd_get_progress_callback() { return sd_progress_cb; diff --git a/util.h b/util.h index 3e34a2f7b..5bd69a624 100644 --- a/util.h +++ b/util.h @@ -60,6 +60,8 @@ void* sd_get_progress_callback_data(); sd_preview_cb_t sd_get_preview_callback(); preview_t sd_get_preview_mode(); int sd_get_preview_interval(); +bool sd_should_preview_denoised(); +bool sd_should_preview_noisy(); #define LOG_DEBUG(format, ...) log_printf(SD_LOG_DEBUG, __FILE__, __LINE__, format, ##__VA_ARGS__) #define LOG_INFO(format, ...) log_printf(SD_LOG_INFO, __FILE__, __LINE__, format, ##__VA_ARGS__) From 27af5a452f8dd67314b30fc5ed8c327673d27967 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?St=C3=A9phane=20du=20Hamel?= Date: Sat, 25 Oct 2025 20:28:27 +0200 Subject: [PATCH 36/45] missing includes --- latent-preview.h | 3 +++ 1 file changed, 3 insertions(+) diff --git a/latent-preview.h b/latent-preview.h index 67011837c..628e72a9c 100644 --- a/latent-preview.h +++ b/latent-preview.h @@ -1,3 +1,6 @@ +#include +#include +#include "ggml.h" const float wan_21_latent_rgb_proj[16][3] = { {-0.1299, -0.1692, 0.2932}, {0.0671, 0.0406, 0.0442}, From 07c61f1afe1277450d2b31d8bc3b11fdba9ba8a1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?St=C3=A9phane=20du=20Hamel?= Date: Sat, 25 Oct 2025 20:49:33 +0200 Subject: [PATCH 37/45] supports noisy preview in main --- examples/cli/main.cpp | 22 ++++++++++++++-------- 1 file changed, 14 insertions(+), 8 deletions(-) diff --git a/examples/cli/main.cpp b/examples/cli/main.cpp index c693aaee1..89de4c251 100644 --- a/examples/cli/main.cpp +++ b/examples/cli/main.cpp @@ -146,13 +146,13 @@ struct SDParams { int preview_interval = 1; std::string 
preview_path = "preview.png"; bool taesd_preview = false; + bool preview_noisy = false; SDParams() { sd_sample_params_init(&sample_params); sd_sample_params_init(&high_noise_sample_params); high_noise_sample_params.sample_steps = -1; } - }; void print_params(SDParams params) { @@ -223,7 +223,7 @@ void print_params(SDParams params) { printf(" video_frames: %d\n", params.video_frames); printf(" vace_strength: %.2f\n", params.vace_strength); printf(" fps: %d\n", params.fps); - printf(" preview_mode: %s\n", previews_str[params.preview_method]); + printf(" preview_mode: %s (%s)\n", previews_str[params.preview_method], params.preview_noisy ? "noisy" : "denoised"); printf(" preview_interval: %d\n", params.preview_interval); free(sample_params_str); free(high_noise_sample_params_str); @@ -604,7 +604,7 @@ void parse_args(int argc, const char** argv, SDParams& params) { "--negative-prompt", "the negative prompt (default: \"\")", &params.negative_prompt}, - {"", + {"", "--preview-path", "path to write preview image to (default: ./preview.png)", &params.preview_path}, @@ -669,7 +669,7 @@ void parse_args(int argc, const char** argv, SDParams& params) { {"", "--preview-interval", "interval in denoising steps between consecutive updates of the image preview file (default is 1, meaning updating at every step)", - &params.preview_interval}, + &params.preview_interval}, }; options.float_options = { @@ -826,8 +826,12 @@ void parse_args(int argc, const char** argv, SDParams& params) { false, &params.auto_resize_ref_image}, {"", "--taesd-preview-only", - std::string("prevents usage of taesd for decoding the final image.
(for use with --preview ") + previews_str[PREVIEW_TAE] + ")", - false, &params.taesd_preview}, + std::string("prevents usage of taesd for decoding the final image. (for use with --preview ") + previews_str[PREVIEW_TAE] + ")", + true, &params.taesd_preview}, + {"", + "--preview-noisy", + "enables previewing noisy inputs of the models rather than the denoised outputs", + true, &params.preview_noisy} }; auto on_mode_arg = [&](int argc, const char** argv, int index) { @@ -1507,6 +1511,7 @@ const char* preview_path; float preview_fps; void step_callback(int step, int frame_count, sd_image_t* image, bool is_noisy) { + (void)step; (void)is_noisy; // is_noisy is set to true if the preview corresponds to noisy latents, false if it's denoised latents // unused in this app, it will either be always noisy or always denoised here @@ -1531,7 +1536,8 @@ int main(int argc, const char* argv[]) { std::transform(file_ext.begin(), file_ext.end(), file_ext.begin(), ::tolower); } if (file_ext == ".png") { - preview_path = (base_path + ".avi").c_str(); + base_path = base_path + ".avi"; + preview_path = base_path.c_str(); } } preview_fps = params.fps; @@ -1544,7 +1550,7 @@ int main(int argc, const char* argv[]) { params.high_noise_sample_params.guidance.slg.layer_count = params.high_noise_skip_layers.size(); sd_set_log_callback(sd_log_cb, (void*)&params); - sd_set_preview_callback((sd_preview_cb_t)step_callback, params.preview_method, params.preview_interval, true, false); + sd_set_preview_callback((sd_preview_cb_t)step_callback, params.preview_method, params.preview_interval, !params.preview_noisy, params.preview_noisy); if (params.verbose) { print_params(params); From f80f61a66998d4364db14d0bd765674f0be0fd2e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?St=C3=A9phane=20du=20Hamel?= Date: Sat, 25 Oct 2025 20:50:18 +0200 Subject: [PATCH 38/45] fix tae-preview-only (bad merge issue) --- stable-diffusion.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/stable-diffusion.cpp b/stable-diffusion.cpp index 18da75c35..af5ef00b1 100644 --- a/stable-diffusion.cpp +++ b/stable-diffusion.cpp @@ -484,7 +484,7 @@ class StableDiffusionGGML {
vae_decode_only = false; } - if (high_noise_diffusion_model || sd_ctx_params->tae_preview_only) { + if (high_noise_diffusion_model) { high_noise_diffusion_model->alloc_params_buffer(); high_noise_diffusion_model->get_param_tensors(tensors); } @@ -508,7 +508,7 @@ class StableDiffusionGGML { } else if (version == VERSION_CHROMA_RADIANCE) { first_stage_model = std::make_shared(vae_backend, offload_params_to_cpu); - } else if (!use_tiny_autoencoder) { + } else if (!use_tiny_autoencoder || sd_ctx_params->tae_preview_only) { first_stage_model = std::make_shared(vae_backend, offload_params_to_cpu, model_loader.tensor_storages_types, From 6c68e395ab87392e1e466097bc11efc66ed357aa Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?St=C3=A9phane=20du=20Hamel?= Date: Sat, 25 Oct 2025 20:53:49 +0200 Subject: [PATCH 39/45] format code --- examples/cli/main.cpp | 9 ++++----- latent-preview.h | 6 +++--- stable-diffusion.cpp | 18 +++++++++--------- 3 files changed, 16 insertions(+), 17 deletions(-) diff --git a/examples/cli/main.cpp b/examples/cli/main.cpp index 89de4c251..619c42847 100644 --- a/examples/cli/main.cpp +++ b/examples/cli/main.cpp @@ -828,11 +828,10 @@ void parse_args(int argc, const char** argv, SDParams& params) { "--taesd-preview-only", std::string("prevents usage of taesd for decoding the final image. 
(for use with --preview ") + previews_str[PREVIEW_TAE] + ")", true, &params.taesd_preview}, - {"", - "--preview-noisy", - "enables previewing noisy inputs of the models rather than the denoised outputs", - true, &params.preview_noisy} - }; + {"", + "--preview-noisy", + "enables previewing noisy inputs of the models rather than the denoised outputs", + true, &params.preview_noisy}}; auto on_mode_arg = [&](int argc, const char** argv, int index) { if (++index >= argc) { diff --git a/latent-preview.h b/latent-preview.h index 628e72a9c..aa0939eba 100644 --- a/latent-preview.h +++ b/latent-preview.h @@ -126,7 +126,7 @@ const float sd_latent_rgb_proj[4][3]{ {0.3250f, 0.4974f, 0.2350f}, {-0.2829f, 0.1762f, 0.2721f}, {-0.2120f, -0.2616f, -0.7177f}}; -float sd_latent_rgb_bias[3] = {0,0,0}; +float sd_latent_rgb_bias[3] = {0, 0, 0}; void preview_latent_video(uint8_t* buffer, struct ggml_tensor* latents, const float (*latent_rgb_proj)[3], const float latent_rgb_bias[3], int width, int height, int frames, int dim) { size_t buffer_head = 0; @@ -135,7 +135,7 @@ void preview_latent_video(uint8_t* buffer, struct ggml_tensor* latents, const fl for (int i = 0; i < width; i++) { size_t latent_id = (i * latents->nb[0] + j * latents->nb[1] + k * latents->nb[2]); float r = 0, g = 0, b = 0; - if(latent_rgb_proj!=NULL){ + if (latent_rgb_proj != NULL) { for (int d = 0; d < dim; d++) { float value = *(float*)((char*)latents->data + latent_id + d * latents->nb[ggml_n_dims(latents) - 1]); r += value * latent_rgb_proj[d][0]; @@ -148,7 +148,7 @@ void preview_latent_video(uint8_t* buffer, struct ggml_tensor* latents, const fl g = *(float*)((char*)latents->data + latent_id + 1 * latents->nb[ggml_n_dims(latents) - 1]); b = *(float*)((char*)latents->data + latent_id + 2 * latents->nb[ggml_n_dims(latents) - 1]); } - if(latent_rgb_bias!=NULL){ + if (latent_rgb_bias != NULL) { // bias r += latent_rgb_bias[0]; g += latent_rgb_bias[1]; diff --git a/stable-diffusion.cpp b/stable-diffusion.cpp index
af5ef00b1..e89e5fb33 100644 --- a/stable-diffusion.cpp +++ b/stable-diffusion.cpp @@ -369,7 +369,7 @@ class StableDiffusionGGML { diffusion_model = std::make_shared(backend, offload_params_to_cpu, sd_ctx_params->diffusion_flash_attn, - model_loader.tensor_storages_types); + model_loader.tensor_storages_types); } else if (sd_version_is_flux(version)) { bool is_chroma = false; for (auto pair : model_loader.tensor_storages_types) { @@ -443,11 +443,11 @@ class StableDiffusionGGML { "", enable_vision); diffusion_model = std::make_shared(backend, - offload_params_to_cpu, - model_loader.tensor_storages_types, - "model.diffusion_model", - version, - sd_ctx_params->diffusion_flash_attn); + offload_params_to_cpu, + model_loader.tensor_storages_types, + "model.diffusion_model", + version, + sd_ctx_params->diffusion_flash_attn); } else { // SD1.x SD2.x SDXL if (strstr(SAFE_STR(sd_ctx_params->photo_maker_path), "v2")) { cond_stage_model = std::make_shared(clip_backend, @@ -1157,7 +1157,7 @@ class StableDiffusionGGML { if (preview_mode == PREVIEW_PROJ) { const float (*latent_rgb_proj)[channel] = NULL; - float *latent_rgb_bias = NULL; + float* latent_rgb_bias = NULL; if (dim == 48) { if (sd_version_is_wan(version)) { @@ -1214,7 +1214,7 @@ class StableDiffusionGGML { uint8_t* data = (uint8_t*)malloc(frames * width * height * channel * sizeof(uint8_t)); - preview_latent_video(data, latents, latent_rgb_proj,latent_rgb_bias, width, height, frames, dim); + preview_latent_video(data, latents, latent_rgb_proj, latent_rgb_bias, width, height, frames, dim); sd_image_t* images = (sd_image_t*)malloc(frames * sizeof(sd_image_t)); for (int i = 0; i < frames; i++) { images[i] = {width, height, channel, data + i * width * height * channel}; @@ -1274,7 +1274,7 @@ class StableDiffusionGGML { } step_callback(step, frames, images, is_noisy); - + ggml_tensor_scale(result, 0); for (int i = 0; i < frames; i++) { free(images[i].data); From fc2a71e56f0c53d7b6687cd7251ed4c0049bb6a0 Mon Sep 17 00:00:00 
2001 From: =?UTF-8?q?St=C3=A9phane=20du=20Hamel?= Date: Sat, 25 Oct 2025 21:36:39 +0200 Subject: [PATCH 40/45] update help in readme --- examples/cli/README.md | 1 + 1 file changed, 1 insertion(+) diff --git a/examples/cli/README.md b/examples/cli/README.md index abbb0b6f8..00e0942f1 100644 --- a/examples/cli/README.md +++ b/examples/cli/README.md @@ -90,6 +90,7 @@ Options: --increase-ref-index automatically increase the indices of references images based on the order they are listed (starting with 1). --disable-auto-resize-ref-image disable auto resize of ref images --taesd-preview-only prevents usage of taesd for decoding the final image. (for use with --preview tae) + --preview-noisy enables previewing noisy inputs of the models rather than the denoised outputs -M, --mode run mode, one of [img_gen, vid_gen, upscale, convert], default: img_gen --type weight type (examples: f32, f16, q4_0, q4_1, q5_0, q5_1, q8_0, q2_K, q3_K, q4_K). If not specified, the default is the type of the weight file From 8a3346f8cd6c833d582ed92ec5e9a735fc2e10d6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?St=C3=A9phane=20du=20Hamel?= Date: Tue, 28 Oct 2025 14:55:06 +0100 Subject: [PATCH 41/45] use bespoke latent to rgb projection to prevent licensing issues --- latent-preview.h | 205 ++++++++++++++++++++++++----------------------- 1 file changed, 105 insertions(+), 100 deletions(-) diff --git a/latent-preview.h b/latent-preview.h index aa0939eba..4f83cfc6d 100644 --- a/latent-preview.h +++ b/latent-preview.h @@ -1,97 +1,102 @@ #include #include #include "ggml.h" + const float wan_21_latent_rgb_proj[16][3] = { - {-0.1299, -0.1692, 0.2932}, - {0.0671, 0.0406, 0.0442}, - {0.3568, 0.2548, 0.1747}, - {0.0372, 0.2344, 0.1420}, - {0.0313, 0.0189, -0.0328}, - {0.0296, -0.0956, -0.0665}, - {-0.3477, -0.4059, -0.2925}, - {0.0166, 0.1902, 0.1975}, - {-0.0412, 0.0267, -0.1364}, - {-0.1293, 0.0740, 0.1636}, - {0.0680, 0.3019, 0.1128}, - {0.0032, 0.0581, 0.0639}, - {-0.1251, 0.0927, 0.1699}, - {0.0060, 
-0.0633, 0.0005}, - {0.3477, 0.2275, 0.2950}, - {0.1984, 0.0913, 0.1861}}; -float wan_21_latent_rgb_bias[3] = {-0.1223, -0.1889, -0.1976}; + {0.015123f, -0.148418f, 0.479828f}, + {0.003652f, -0.010680f, -0.037142f}, + {0.212264f, 0.063033f, 0.016779f}, + {0.232999f, 0.406476f, 0.220125f}, + {-0.051864f, -0.082384f, -0.069396f}, + {0.085005f, -0.161492f, 0.010689f}, + {-0.245369f, -0.506846f, -0.117010f}, + {-0.151145f, 0.017721f, 0.007207f}, + {-0.293239f, -0.207936f, -0.421135f}, + {-0.187721f, 0.050783f, 0.177649f}, + {-0.013067f, 0.265964f, 0.166578f}, + {0.028327f, 0.109329f, 0.108642f}, + {-0.205343f, 0.043991f, 0.148914f}, + {0.014307f, -0.048647f, -0.007219f}, + {0.217150f, 0.053074f, 0.319923f}, + {0.155357f, 0.083156f, 0.064780f} +}; +float wan_21_latent_rgb_bias[3] = {-0.270270f, -0.234976f, -0.456853f}; const float wan_22_latent_rgb_proj[48][3] = { - {0.0119, 0.0103, 0.0046}, - {-0.1062, -0.0504, 0.0165}, - {0.0140, 0.0409, 0.0491}, - {-0.0813, -0.0677, 0.0607}, - {0.0656, 0.0851, 0.0808}, - {0.0264, 0.0463, 0.0912}, - {0.0295, 0.0326, 0.0590}, - {-0.0244, -0.0270, 0.0025}, - {0.0443, -0.0102, 0.0288}, - {-0.0465, -0.0090, -0.0205}, - {0.0359, 0.0236, 0.0082}, - {-0.0776, 0.0854, 0.1048}, - {0.0564, 0.0264, 0.0561}, - {0.0006, 0.0594, 0.0418}, - {-0.0319, -0.0542, -0.0637}, - {-0.0268, 0.0024, 0.0260}, - {0.0539, 0.0265, 0.0358}, - {-0.0359, -0.0312, -0.0287}, - {-0.0285, -0.1032, -0.1237}, - {0.1041, 0.0537, 0.0622}, - {-0.0086, -0.0374, -0.0051}, - {0.0390, 0.0670, 0.2863}, - {0.0069, 0.0144, 0.0082}, - {0.0006, -0.0167, 0.0079}, - {0.0313, -0.0574, -0.0232}, - {-0.1454, -0.0902, -0.0481}, - {0.0714, 0.0827, 0.0447}, - {-0.0304, -0.0574, -0.0196}, - {0.0401, 0.0384, 0.0204}, - {-0.0758, -0.0297, -0.0014}, - {0.0568, 0.1307, 0.1372}, - {-0.0055, -0.0310, -0.0380}, - {0.0239, -0.0305, 0.0325}, - {-0.0663, -0.0673, -0.0140}, - {-0.0416, -0.0047, -0.0023}, - {0.0166, 0.0112, -0.0093}, - {-0.0211, 0.0011, 0.0331}, - {0.1833, 0.1466, 0.2250}, - {-0.0368, 
0.0370, 0.0295}, - {-0.3441, -0.3543, -0.2008}, - {-0.0479, -0.0489, -0.0420}, - {-0.0660, -0.0153, 0.0800}, - {-0.0101, 0.0068, 0.0156}, - {-0.0690, -0.0452, -0.0927}, - {-0.0145, 0.0041, 0.0015}, - {0.0421, 0.0451, 0.0373}, - {0.0504, -0.0483, -0.0356}, - {-0.0837, 0.0168, 0.0055}}; -float wan_22_latent_rgb_bias[3] = {0.0317, -0.0878, -0.1388}; + {0.017126f, -0.027230f, -0.019257f}, + {-0.113739f, -0.028715f, -0.022885f}, + {-0.000106f, 0.021494f, 0.004629f}, + {-0.013273f, -0.107137f, -0.033638f}, + {-0.000381f, 0.000279f, 0.025877f}, + {-0.014216f, -0.003975f, 0.040528f}, + {0.001638f, -0.000748f, 0.011022f}, + {0.029238f, -0.006697f, 0.035933f}, + {0.021641f, -0.015874f, 0.040531f}, + {-0.101984f, -0.070160f, -0.028855f}, + {0.033207f, -0.021068f, 0.002663f}, + {-0.104711f, 0.121673f, 0.102981f}, + {0.082647f, -0.004991f, 0.057237f}, + {-0.027375f, 0.031581f, 0.006868f}, + {-0.045434f, 0.029444f, 0.019287f}, + {-0.046572f, -0.012537f, 0.006675f}, + {0.074709f, 0.033690f, 0.025289f}, + {-0.008251f, -0.002745f, -0.006999f}, + {0.012685f, -0.061856f, -0.048658f}, + {0.042304f, -0.007039f, 0.000295f}, + {-0.007644f, -0.060843f, -0.033142f}, + {0.159909f, 0.045628f, 0.367541f}, + {0.095171f, 0.086438f, 0.010271f}, + {0.006812f, 0.019643f, 0.029637f}, + {0.003467f, -0.010705f, 0.014252f}, + {-0.099681f, -0.066272f, -0.006243f}, + {0.047357f, 0.037040f, 0.000185f}, + {-0.041797f, -0.089225f, -0.032257f}, + {0.008928f, 0.017028f, 0.018684f}, + {-0.042255f, 0.016045f, 0.006849f}, + {0.011268f, 0.036462f, 0.037387f}, + {0.011553f, -0.016375f, -0.048589f}, + {0.046266f, -0.027189f, 0.056979f}, + {0.009640f, -0.017576f, 0.030324f}, + {-0.045794f, -0.036083f, -0.010616f}, + {0.022418f, 0.039783f, -0.032939f}, + {-0.052714f, -0.015525f, 0.007438f}, + {0.193004f, 0.223541f, 0.264175f}, + {-0.059406f, -0.008188f, 0.022867f}, + {-0.156742f, -0.263791f, -0.007385f}, + {-0.015717f, 0.016570f, 0.033969f}, + {0.037969f, 0.109835f, 0.200449f}, + {-0.000782f, -0.009566f, 
-0.008058f}, + {0.010709f, 0.052960f, -0.044195f}, + {0.017271f, 0.045839f, 0.034569f}, + {0.009424f, 0.013088f, -0.001714f}, + {-0.024805f, -0.059378f, -0.033756f}, + {-0.078293f, 0.029070f, 0.026129f} +}; +float wan_22_latent_rgb_bias[3] = {0.013160f, -0.096492f, -0.071323f}; -// https://github.com/comfyanonymous/ComfyUI/blob/master/comfy/latent_formats.py#L152-L169 const float flux_latent_rgb_proj[16][3] = { - {-0.0346f, 0.0244f, 0.0681f}, - {0.0034f, 0.0210f, 0.0687f}, - {0.0275f, -0.0668f, -0.0433f}, - {-0.0174f, 0.0160f, 0.0617f}, - {0.0859f, 0.0721f, 0.0329f}, - {0.0004f, 0.0383f, 0.0115f}, - {0.0405f, 0.0861f, 0.0915f}, - {-0.0236f, -0.0185f, -0.0259f}, - {-0.0245f, 0.0250f, 0.1180f}, - {0.1008f, 0.0755f, -0.0421f}, - {-0.0515f, 0.0201f, 0.0011f}, - {0.0428f, -0.0012f, -0.0036f}, - {0.0817f, 0.0765f, 0.0749f}, - {-0.1264f, -0.0522f, -0.1103f}, - {-0.0280f, -0.0881f, -0.0499f}, - {-0.1262f, -0.0982f, -0.0778f}}; -float flux_latent_rgb_bias[3] = {-0.0329, -0.0718, -0.0851}; + {-0.041168f, 0.019917f, 0.097253f}, + {0.028096f, 0.026730f, 0.129576f}, + {0.065618f, -0.067950f, -0.014651f}, + {-0.012998f, -0.014762f, 0.081251f}, + {0.078567f, 0.059296f, -0.024687f}, + {-0.015987f, -0.003697f, 0.005012f}, + {0.033605f, 0.138999f, 0.068517f}, + {-0.024450f, -0.063567f, -0.030101f}, + {-0.040194f, -0.016710f, 0.127185f}, + {0.112681f, 0.088764f, -0.041940f}, + {-0.023498f, 0.093664f, 0.025543f}, + {0.082899f, 0.048320f, 0.007491f}, + {0.075712f, 0.074139f, 0.081965f}, + {-0.143501f, 0.018263f, -0.136138f}, + {-0.025767f, -0.082035f, -0.040023f}, + {-0.111849f, -0.055589f, -0.032361f} +}; +float flux_latent_rgb_bias[3] = {0.024600f, -0.006937f, -0.008089f}; -// https://github.com/Stability-AI/sd3.5/blob/main/sd3_impls.py#L228-L246 +// This one was taken straight from +// https://github.com/Stability-AI/sd3.5/blob/8565799a3b41eb0c7ba976d18375f0f753f56402/sd3_impls.py#L288-L303 +// (MiT Licence) const float sd3_latent_rgb_proj[16][3] = { {-0.0645f, 0.0177f, 0.1052f}, 
{0.0028f, 0.0312f, 0.0650f}, @@ -110,23 +115,23 @@ const float sd3_latent_rgb_proj[16][3] = { {-0.0749f, -0.0634f, -0.0456f}, {-0.1418f, -0.1457f, -0.1259f}, }; -float sd3_latent_rgb_bias[3] = {0, 0, 0}; +float sd3_latent_rgb_bias[3] = NULL; -// https://github.com/comfyanonymous/ComfyUI/blob/master/comfy/latent_formats.py#L32-L38 -const float sdxl_latent_rgb_proj[4][3] = { - {0.3651f, 0.4232f, 0.4341f}, - {-0.2533f, -0.0042f, 0.1068f}, - {0.1076f, 0.1111f, -0.0362f}, - {-0.3165f, -0.2492f, -0.2188f}}; -float sdxl_latent_rgb_bias[3] = {0.1084, -0.0175, -0.0011}; +const float sdxl_latent_rgb_proj[4][3] = { + {0.258303f, 0.277640f, 0.329699f}, + {-0.299701f, 0.105446f, 0.014194f}, + {0.050522f, 0.186163f, -0.143257f}, + {-0.211938f, -0.149892f, -0.080036f} +}; +float sdxl_latent_rgb_bias[3] = {0.144381f, -0.033313f, 0.007061f}; -// https://github.com/comfyanonymous/ComfyUI/blob/master/comfy/latent_formats.py#L32-L38 -const float sd_latent_rgb_proj[4][3]{ - {0.3512f, 0.2297f, 0.3227f}, - {0.3250f, 0.4974f, 0.2350f}, - {-0.2829f, 0.1762f, 0.2721f}, - {-0.2120f, -0.2616f, -0.7177f}}; -float sd_latent_rgb_bias[3] = {0, 0, 0}; +const float sd_latent_rgb_proj[4][3] = { + {0.337366f, 0.216344f, 0.257386f}, + {0.165636f, 0.386828f, 0.046994f}, + {-0.267803f, 0.237036f, 0.223517f}, + {-0.178022f, -0.200862f, -0.678514f} +}; +float sd_latent_rgb_bias[3] ={-0.017478f, -0.055834f, -0.105825f}; void preview_latent_video(uint8_t* buffer, struct ggml_tensor* latents, const float (*latent_rgb_proj)[3], const float latent_rgb_bias[3], int width, int height, int frames, int dim) { size_t buffer_head = 0; From b5e73f9c1acf80f9b2cd5c3e8b01c1614c1c60b3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?St=C3=A9phane=20du=20Hamel?= Date: Tue, 28 Oct 2025 15:04:08 +0100 Subject: [PATCH 42/45] fix sd3 null bias breaking build --- latent-preview.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/latent-preview.h b/latent-preview.h index 4f83cfc6d..354f8c3a4 100644 --- 
a/latent-preview.h +++ b/latent-preview.h @@ -13,7 +13,7 @@ const float wan_21_latent_rgb_proj[16][3] = { {-0.151145f, 0.017721f, 0.007207f}, {-0.293239f, -0.207936f, -0.421135f}, {-0.187721f, 0.050783f, 0.177649f}, - {-0.013067f, 0.265964f, 0.166578f}, + {-0.013067f, 0.265964f, 0.166578f }, {0.028327f, 0.109329f, 0.108642f}, {-0.205343f, 0.043991f, 0.148914f}, {0.014307f, -0.048647f, -0.007219f}, @@ -115,7 +115,7 @@ const float sd3_latent_rgb_proj[16][3] = { {-0.0749f, -0.0634f, -0.0456f}, {-0.1418f, -0.1457f, -0.1259f}, }; -float sd3_latent_rgb_bias[3] = NULL; +float sd3_latent_rgb_bias[3] = {0, 0, 0}; const float sdxl_latent_rgb_proj[4][3] = { {0.258303f, 0.277640f, 0.329699f}, From c1226d6f5332a5366bbf4c5ca6af59b311f379f5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?St=C3=A9phane=20du=20Hamel?= Date: Tue, 28 Oct 2025 16:40:06 +0100 Subject: [PATCH 43/45] use new ggml_ext function names --- stable-diffusion.cpp | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/stable-diffusion.cpp b/stable-diffusion.cpp index 3bae2af07..c325a284c 100644 --- a/stable-diffusion.cpp +++ b/stable-diffusion.cpp @@ -1250,7 +1250,7 @@ class StableDiffusionGGML { return; } - ggml_tensor_clamp(result, 0.0f, 1.0f); + ggml_ext_tensor_clamp_inplace(result, 0.0f, 1.0f); uint32_t frames = 1; if (ggml_n_dims(latents) == 4) { frames = result->ne[2]; @@ -1262,12 +1262,12 @@ class StableDiffusionGGML { images[i].width = result->ne[0]; images[i].height = result->ne[1]; images[i].channel = 3; - images[i].data = sd_tensor_to_image(result, i, ggml_n_dims(latents) == 4); + images[i].data = ggml_tensor_to_sd_image(result, i, ggml_n_dims(latents) == 4); } step_callback(step, frames, images, is_noisy); - ggml_tensor_scale(result, 0); + ggml_ext_tensor_scale_inplace(result, 0); for (int i = 0; i < frames; i++) { free(images[i].data); } From 3db7fb14370a979edc3cd51437aaedfbaf42766d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?St=C3=A9phane=20du=20Hamel?= Date: Wed, 29 Oct 2025 11:20:15 +0100 
Subject: [PATCH 44/45] Fix radiance proj support --- stable-diffusion.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/stable-diffusion.cpp b/stable-diffusion.cpp index c325a284c..9b6e2fc56 100644 --- a/stable-diffusion.cpp +++ b/stable-diffusion.cpp @@ -1191,7 +1191,7 @@ class StableDiffusionGGML { LOG_WARN("No latent to RGB projection known for this model"); return; } - } else if (dim == 4) { + } else if (dim == 3) { // Do nothing, assuming already RGB latents } else { LOG_WARN("No latent to RGB projection known for this model"); From 044f0ed89055d6513b96620701e737cebe5803da Mon Sep 17 00:00:00 2001 From: leejet Date: Mon, 10 Nov 2025 00:00:29 +0800 Subject: [PATCH 45/45] format code --- latent-preview.h | 31 +++++++++++++------------------ stable-diffusion.cpp | 40 ++++++++++++++++++++-------------------- util.cpp | 2 +- 3 files changed, 34 insertions(+), 39 deletions(-) diff --git a/latent-preview.h b/latent-preview.h index 354f8c3a4..97409a7d8 100644 --- a/latent-preview.h +++ b/latent-preview.h @@ -13,14 +13,13 @@ const float wan_21_latent_rgb_proj[16][3] = { {-0.151145f, 0.017721f, 0.007207f}, {-0.293239f, -0.207936f, -0.421135f}, {-0.187721f, 0.050783f, 0.177649f}, - {-0.013067f, 0.265964f, 0.166578f }, + {-0.013067f, 0.265964f, 0.166578f}, {0.028327f, 0.109329f, 0.108642f}, {-0.205343f, 0.043991f, 0.148914f}, {0.014307f, -0.048647f, -0.007219f}, {0.217150f, 0.053074f, 0.319923f}, - {0.155357f, 0.083156f, 0.064780f} -}; -float wan_21_latent_rgb_bias[3] = {-0.270270f, -0.234976f, -0.456853f}; + {0.155357f, 0.083156f, 0.064780f}}; +float wan_21_latent_rgb_bias[3] = {-0.270270f, -0.234976f, -0.456853f}; const float wan_22_latent_rgb_proj[48][3] = { {0.017126f, -0.027230f, -0.019257f}, @@ -70,9 +69,8 @@ const float wan_22_latent_rgb_proj[48][3] = { {0.017271f, 0.045839f, 0.034569f}, {0.009424f, 0.013088f, -0.001714f}, {-0.024805f, -0.059378f, -0.033756f}, - {-0.078293f, 0.029070f, 0.026129f} -}; -float wan_22_latent_rgb_bias[3] = 
{0.013160f, -0.096492f, -0.071323f}; + {-0.078293f, 0.029070f, 0.026129f}}; +float wan_22_latent_rgb_bias[3] = {0.013160f, -0.096492f, -0.071323f}; const float flux_latent_rgb_proj[16][3] = { {-0.041168f, 0.019917f, 0.097253f}, @@ -90,11 +88,10 @@ const float flux_latent_rgb_proj[16][3] = { {0.075712f, 0.074139f, 0.081965f}, {-0.143501f, 0.018263f, -0.136138f}, {-0.025767f, -0.082035f, -0.040023f}, - {-0.111849f, -0.055589f, -0.032361f} -}; + {-0.111849f, -0.055589f, -0.032361f}}; float flux_latent_rgb_bias[3] = {0.024600f, -0.006937f, -0.008089f}; -// This one was taken straight from +// This one was taken straight from // https://github.com/Stability-AI/sd3.5/blob/8565799a3b41eb0c7ba976d18375f0f753f56402/sd3_impls.py#L288-L303 // (MiT Licence) const float sd3_latent_rgb_proj[16][3] = { @@ -117,21 +114,19 @@ const float sd3_latent_rgb_proj[16][3] = { }; float sd3_latent_rgb_bias[3] = {0, 0, 0}; -const float sdxl_latent_rgb_proj[4][3] = { +const float sdxl_latent_rgb_proj[4][3] = { {0.258303f, 0.277640f, 0.329699f}, {-0.299701f, 0.105446f, 0.014194f}, {0.050522f, 0.186163f, -0.143257f}, - {-0.211938f, -0.149892f, -0.080036f} -}; + {-0.211938f, -0.149892f, -0.080036f}}; float sdxl_latent_rgb_bias[3] = {0.144381f, -0.033313f, 0.007061f}; const float sd_latent_rgb_proj[4][3] = { {0.337366f, 0.216344f, 0.257386f}, {0.165636f, 0.386828f, 0.046994f}, {-0.267803f, 0.237036f, 0.223517f}, - {-0.178022f, -0.200862f, -0.678514f} -}; -float sd_latent_rgb_bias[3] ={-0.017478f, -0.055834f, -0.105825f}; + {-0.178022f, -0.200862f, -0.678514f}}; +float sd_latent_rgb_bias[3] = {-0.017478f, -0.055834f, -0.105825f}; void preview_latent_video(uint8_t* buffer, struct ggml_tensor* latents, const float (*latent_rgb_proj)[3], const float latent_rgb_bias[3], int width, int height, int frames, int dim) { size_t buffer_head = 0; @@ -140,7 +135,7 @@ void preview_latent_video(uint8_t* buffer, struct ggml_tensor* latents, const fl for (int i = 0; i < width; i++) { size_t latent_id = (i * 
latents->nb[0] + j * latents->nb[1] + k * latents->nb[2]); float r = 0, g = 0, b = 0; - if (latent_rgb_proj != NULL) { + if (latent_rgb_proj != nullptr) { for (int d = 0; d < dim; d++) { float value = *(float*)((char*)latents->data + latent_id + d * latents->nb[ggml_n_dims(latents) - 1]); r += value * latent_rgb_proj[d][0]; @@ -153,7 +148,7 @@ void preview_latent_video(uint8_t* buffer, struct ggml_tensor* latents, const fl g = *(float*)((char*)latents->data + latent_id + 1 * latents->nb[ggml_n_dims(latents) - 1]); b = *(float*)((char*)latents->data + latent_id + 2 * latents->nb[ggml_n_dims(latents) - 1]); } - if (latent_rgb_bias != NULL) { + if (latent_rgb_bias != nullptr) { // bias r += latent_rgb_bias[0]; g += latent_rgb_bias[1]; diff --git a/stable-diffusion.cpp b/stable-diffusion.cpp index 91a6eff05..b675b85b0 100644 --- a/stable-diffusion.cpp +++ b/stable-diffusion.cpp @@ -1124,7 +1124,7 @@ class StableDiffusionGGML { void silent_tiling(ggml_tensor* input, ggml_tensor* output, const int scale, const int tile_size, const float tile_overlap_factor, on_tile_process on_processing) { sd_progress_cb_t cb = sd_get_progress_callback(); void* cbd = sd_get_progress_callback_data(); - sd_set_progress_callback((sd_progress_cb_t)suppress_pp, NULL); + sd_set_progress_callback((sd_progress_cb_t)suppress_pp, nullptr); sd_tiling(input, output, scale, tile_size, tile_overlap_factor, on_processing); sd_set_progress_callback(cb, cbd); } @@ -1143,8 +1143,8 @@ class StableDiffusionGGML { uint32_t dim = latents->ne[ggml_n_dims(latents) - 1]; if (preview_mode == PREVIEW_PROJ) { - const float (*latent_rgb_proj)[channel] = NULL; - float* latent_rgb_bias = NULL; + const float(*latent_rgb_proj)[channel] = nullptr; + float* latent_rgb_bias = nullptr; if (dim == 48) { if (sd_version_is_wan(version)) { @@ -1215,7 +1215,7 @@ class StableDiffusionGGML { if (vae_tiling_params.enabled) { // split latent in 32x32 tiles and compute in several steps auto on_tiling = [&](ggml_tensor* in, 
ggml_tensor* out, bool init) { - first_stage_model->compute(n_threads, in, true, &out, NULL); + first_stage_model->compute(n_threads, in, true, &out, nullptr); }; silent_tiling(latents, result, get_vae_scale_factor(), 32, 0.5f, on_tiling); @@ -1234,7 +1234,7 @@ class StableDiffusionGGML { if (vae_tiling_params.enabled) { // split latent in 64x64 tiles and compute in several steps auto on_tiling = [&](ggml_tensor* in, ggml_tensor* out, bool init) { - tae_first_stage->compute(n_threads, in, true, &out, NULL); + tae_first_stage->compute(n_threads, in, true, &out, nullptr); }; silent_tiling(latents, result, get_vae_scale_factor(), 64, 0.5f, on_tiling); } else { @@ -1346,7 +1346,7 @@ class StableDiffusionGGML { int64_t t0 = ggml_time_us(); - struct ggml_tensor* preview_tensor = NULL; + struct ggml_tensor* preview_tensor = nullptr; auto sd_preview_mode = sd_get_preview_mode(); if (sd_preview_mode != PREVIEW_NONE && sd_preview_mode != PREVIEW_PROJ) { int64_t W = x->ne[0] * get_vae_scale_factor(); @@ -1408,7 +1408,7 @@ class StableDiffusionGGML { if (denoise_mask != nullptr && version == VERSION_WAN2_2_TI2V) { apply_mask(noised_input, init_latent, denoise_mask); } - if (sd_preview_cb != NULL && sd_should_preview_noisy()) { + if (sd_preview_cb != nullptr && sd_should_preview_noisy()) { if (step % sd_get_preview_interval() == 0) { preview_image(work_ctx, step, noised_input, version, sd_preview_mode, preview_tensor, sd_preview_cb, true); } @@ -1538,7 +1538,7 @@ class StableDiffusionGGML { apply_mask(denoised, init_latent, denoise_mask); } - if (sd_preview_cb != NULL && sd_should_preview_denoised()) { + if (sd_preview_cb != nullptr && sd_should_preview_denoised()) { if (step % sd_get_preview_interval() == 0) { preview_image(work_ctx, step, denoised, version, sd_preview_mode, preview_tensor, sd_preview_cb, false); } @@ -1629,12 +1629,12 @@ class StableDiffusionGGML { -0.0313f, -0.1649f, 0.0117f, 0.0723f, -0.2839f, -0.2083f, -0.0520f, 0.3748f, 0.0152f, 0.1957f, 0.1433f, 
-0.2944f, 0.3573f, -0.0548f, -0.1681f, -0.0667f}; latents_std_vec = { - 0.4765f, 1.0364f, 0.4514f, 1.1677f, 0.5313f, 0.4990f, 0.4818f, 0.5013f, - 0.8158f, 1.0344f, 0.5894f, 1.0901f, 0.6885f, 0.6165f, 0.8454f, 0.4978f, - 0.5759f, 0.3523f, 0.7135f, 0.6804f, 0.5833f, 1.4146f, 0.8986f, 0.5659f, - 0.7069f, 0.5338f, 0.4889f, 0.4917f, 0.4069f, 0.4999f, 0.6866f, 0.4093f, - 0.5709f, 0.6065f, 0.6415f, 0.4944f, 0.5726f, 1.2042f, 0.5458f, 1.6887f, - 0.3971f, 1.0600f, 0.3943f, 0.5537f, 0.5444f, 0.4089f, 0.7468f, 0.7744f}; + 0.4765f, 1.0364f, 0.4514f, 1.1677f, 0.5313f, 0.4990f, 0.4818f, 0.5013f, + 0.8158f, 1.0344f, 0.5894f, 1.0901f, 0.6885f, 0.6165f, 0.8454f, 0.4978f, + 0.5759f, 0.3523f, 0.7135f, 0.6804f, 0.5833f, 1.4146f, 0.8986f, 0.5659f, + 0.7069f, 0.5338f, 0.4889f, 0.4917f, 0.4069f, 0.4999f, 0.6866f, 0.4093f, + 0.5709f, 0.6065f, 0.6415f, 0.4944f, 0.5726f, 1.2042f, 0.5458f, 1.6887f, + 0.3971f, 1.0600f, 0.3943f, 0.5537f, 0.5444f, 0.4089f, 0.7468f, 0.7744f}; } for (int i = 0; i < latent->ne[3]; i++) { float mean = latents_mean_vec[i]; @@ -1675,12 +1675,12 @@ class StableDiffusionGGML { -0.0313f, -0.1649f, 0.0117f, 0.0723f, -0.2839f, -0.2083f, -0.0520f, 0.3748f, 0.0152f, 0.1957f, 0.1433f, -0.2944f, 0.3573f, -0.0548f, -0.1681f, -0.0667f}; latents_std_vec = { - 0.4765f, 1.0364f, 0.4514f, 1.1677f, 0.5313f, 0.4990f, 0.4818f, 0.5013f, - 0.8158f, 1.0344f, 0.5894f, 1.0901f, 0.6885f, 0.6165f, 0.8454f, 0.4978f, - 0.5759f, 0.3523f, 0.7135f, 0.6804f, 0.5833f, 1.4146f, 0.8986f, 0.5659f, - 0.7069f, 0.5338f, 0.4889f, 0.4917f, 0.4069f, 0.4999f, 0.6866f, 0.4093f, - 0.5709f, 0.6065f, 0.6415f, 0.4944f, 0.5726f, 1.2042f, 0.5458f, 1.6887f, - 0.3971f, 1.0600f, 0.3943f, 0.5537f, 0.5444f, 0.4089f, 0.7468f, 0.7744f}; + 0.4765f, 1.0364f, 0.4514f, 1.1677f, 0.5313f, 0.4990f, 0.4818f, 0.5013f, + 0.8158f, 1.0344f, 0.5894f, 1.0901f, 0.6885f, 0.6165f, 0.8454f, 0.4978f, + 0.5759f, 0.3523f, 0.7135f, 0.6804f, 0.5833f, 1.4146f, 0.8986f, 0.5659f, + 0.7069f, 0.5338f, 0.4889f, 0.4917f, 0.4069f, 0.4999f, 0.6866f, 
0.4093f, + 0.5709f, 0.6065f, 0.6415f, 0.4944f, 0.5726f, 1.2042f, 0.5458f, 1.6887f, + 0.3971f, 1.0600f, 0.3943f, 0.5537f, 0.5444f, 0.4089f, 0.7468f, 0.7744f}; } for (int i = 0; i < latent->ne[3]; i++) { float mean = latents_mean_vec[i]; diff --git a/util.cpp b/util.cpp index b30f5cef9..1aa9beff8 100644 --- a/util.cpp +++ b/util.cpp @@ -185,7 +185,7 @@ int32_t get_num_physical_cores() { static sd_progress_cb_t sd_progress_cb = nullptr; void* sd_progress_cb_data = nullptr; -static sd_preview_cb_t sd_preview_cb = NULL; +static sd_preview_cb_t sd_preview_cb = nullptr; preview_t sd_preview_mode = PREVIEW_NONE; int sd_preview_interval = 1; bool sd_preview_denoised = true;