From 7d56d08f329fde7649817549021e7a46c222f254 Mon Sep 17 00:00:00 2001 From: RodrigoSantiago Date: Wed, 2 Jul 2025 17:34:14 -0300 Subject: [PATCH 1/3] Preview --- .gitignore | 5 +- .idea/.gitignore | 8 + .idea/codeStyles/Project.xml | 7 + .idea/codeStyles/codeStyleConfig.xml | 5 + .idea/misc.xml | 4 + .idea/modules.xml | 8 + .idea/stable-diffusion.cpp.iml | 2 + .idea/vcs.xml | 7 + README.md | 7 +- ggml_extend.hpp | 2 +- latent-preview.h | 83 ++++++++++ stable-diffusion.cpp | 225 ++++++++++++++++++++++----- stable-diffusion.h | 28 +++- util.cpp | 27 ++++ util.h | 7 + 15 files changed, 374 insertions(+), 51 deletions(-) create mode 100644 .idea/.gitignore create mode 100644 .idea/codeStyles/Project.xml create mode 100644 .idea/codeStyles/codeStyleConfig.xml create mode 100644 .idea/misc.xml create mode 100644 .idea/modules.xml create mode 100644 .idea/stable-diffusion.cpp.iml create mode 100644 .idea/vcs.xml create mode 100644 latent-preview.h diff --git a/.gitignore b/.gitignore index 38fe570df..e1c30e700 100644 --- a/.gitignore +++ b/.gitignore @@ -10,4 +10,7 @@ test/ *.gguf output*.png models* -*.log \ No newline at end of file +*.log +preview.png +cmake-build-debug/ +cmake-build-debug-mingw/ \ No newline at end of file diff --git a/.idea/.gitignore b/.idea/.gitignore new file mode 100644 index 000000000..13566b81b --- /dev/null +++ b/.idea/.gitignore @@ -0,0 +1,8 @@ +# Default ignored files +/shelf/ +/workspace.xml +# Editor-based HTTP Client requests +/httpRequests/ +# Datasource local storage ignored files +/dataSources/ +/dataSources.local.xml diff --git a/.idea/codeStyles/Project.xml b/.idea/codeStyles/Project.xml new file mode 100644 index 000000000..f60388162 --- /dev/null +++ b/.idea/codeStyles/Project.xml @@ -0,0 +1,7 @@ + + + + + + \ No newline at end of file diff --git a/.idea/codeStyles/codeStyleConfig.xml b/.idea/codeStyles/codeStyleConfig.xml new file mode 100644 index 000000000..79ee123c2 --- /dev/null +++ b/.idea/codeStyles/codeStyleConfig.xml @@ 
-0,0 +1,5 @@ + + + + \ No newline at end of file diff --git a/.idea/misc.xml b/.idea/misc.xml new file mode 100644 index 000000000..79b3c9483 --- /dev/null +++ b/.idea/misc.xml @@ -0,0 +1,4 @@ + + + + \ No newline at end of file diff --git a/.idea/modules.xml b/.idea/modules.xml new file mode 100644 index 000000000..c930abe40 --- /dev/null +++ b/.idea/modules.xml @@ -0,0 +1,8 @@ + + + + + + + + \ No newline at end of file diff --git a/.idea/stable-diffusion.cpp.iml b/.idea/stable-diffusion.cpp.iml new file mode 100644 index 000000000..f08604bb6 --- /dev/null +++ b/.idea/stable-diffusion.cpp.iml @@ -0,0 +1,2 @@ + + \ No newline at end of file diff --git a/.idea/vcs.xml b/.idea/vcs.xml new file mode 100644 index 000000000..2b99a06e0 --- /dev/null +++ b/.idea/vcs.xml @@ -0,0 +1,7 @@ + + + + + + + \ No newline at end of file diff --git a/README.md b/README.md index 4720dc29c..8ac08a051 100644 --- a/README.md +++ b/README.md @@ -50,7 +50,7 @@ Inference of Stable Diffusion and Flux in pure C/C++ - Linux - Mac OS - Windows - - Android (via Termux, [Local Diffusion](https://github.com/rmatif/Local-Diffusion)) + - Android (via Termux) ### TODO @@ -392,12 +392,10 @@ Using formats of different precisions will yield results of varying quality. These projects wrap `stable-diffusion.cpp` for easier use in other languages/frameworks. 
-* Golang (non-cgo): [seasonjs/stable-diffusion](https://github.com/seasonjs/stable-diffusion) -* Golang (cgo): [Binozo/GoStableDiffusion](https://github.com/Binozo/GoStableDiffusion) +* Golang: [seasonjs/stable-diffusion](https://github.com/seasonjs/stable-diffusion) * C#: [DarthAffe/StableDiffusion.NET](https://github.com/DarthAffe/StableDiffusion.NET) * Python: [william-murray1204/stable-diffusion-cpp-python](https://github.com/william-murray1204/stable-diffusion-cpp-python) * Rust: [newfla/diffusion-rs](https://github.com/newfla/diffusion-rs) -* Flutter/Dart: [rmatif/Local-Diffusion](https://github.com/rmatif/Local-Diffusion) ## UIs @@ -406,7 +404,6 @@ These projects use `stable-diffusion.cpp` as a backend for their image generatio - [Jellybox](https://jellybox.com) - [Stable Diffusion GUI](https://github.com/fszontagh/sd.cpp.gui.wx) - [Stable Diffusion CLI-GUI](https://github.com/piallai/stable-diffusion.cpp) -- [Local Diffusion](https://github.com/rmatif/Local-Diffusion) ## Contributors diff --git a/ggml_extend.hpp b/ggml_extend.hpp index 9f6a4fef6..424ebd5e4 100644 --- a/ggml_extend.hpp +++ b/ggml_extend.hpp @@ -634,7 +634,7 @@ __STATIC_INLINE__ void sd_tiling(ggml_tensor* input, ggml_tensor* output, const ggml_tensor* output_tile = ggml_new_tensor_4d(tiles_ctx, GGML_TYPE_F32, tile_size * scale, tile_size * scale, output->ne[2], 1); on_processing(input_tile, NULL, true); int num_tiles = ceil((float)input_width / non_tile_overlap) * ceil((float)input_height / non_tile_overlap); - LOG_INFO("processing %i tiles", num_tiles); + LOG_DEBUG("processing %i tiles", num_tiles); pretty_progress(1, num_tiles, 0.0f); int tile_count = 1; bool last_y = false, last_x = false; diff --git a/latent-preview.h b/latent-preview.h new file mode 100644 index 000000000..7dee66f6d --- /dev/null +++ b/latent-preview.h @@ -0,0 +1,83 @@ + +// https://github.com/comfyanonymous/ComfyUI/blob/master/comfy/latent_formats.py#L152-L169 +const float flux_latent_rgb_proj[16][3] = { + {-0.0346f, 
0.0244f, 0.0681f}, + {0.0034f, 0.0210f, 0.0687f}, + {0.0275f, -0.0668f, -0.0433f}, + {-0.0174f, 0.0160f, 0.0617f}, + {0.0859f, 0.0721f, 0.0329f}, + {0.0004f, 0.0383f, 0.0115f}, + {0.0405f, 0.0861f, 0.0915f}, + {-0.0236f, -0.0185f, -0.0259f}, + {-0.0245f, 0.0250f, 0.1180f}, + {0.1008f, 0.0755f, -0.0421f}, + {-0.0515f, 0.0201f, 0.0011f}, + {0.0428f, -0.0012f, -0.0036f}, + {0.0817f, 0.0765f, 0.0749f}, + {-0.1264f, -0.0522f, -0.1103f}, + {-0.0280f, -0.0881f, -0.0499f}, + {-0.1262f, -0.0982f, -0.0778f}}; + +// https://github.com/Stability-AI/sd3.5/blob/main/sd3_impls.py#L228-L246 +const float sd3_latent_rgb_proj[16][3] = { + {-0.0645f, 0.0177f, 0.1052f}, + {0.0028f, 0.0312f, 0.0650f}, + {0.1848f, 0.0762f, 0.0360f}, + {0.0944f, 0.0360f, 0.0889f}, + {0.0897f, 0.0506f, -0.0364f}, + {-0.0020f, 0.1203f, 0.0284f}, + {0.0855f, 0.0118f, 0.0283f}, + {-0.0539f, 0.0658f, 0.1047f}, + {-0.0057f, 0.0116f, 0.0700f}, + {-0.0412f, 0.0281f, -0.0039f}, + {0.1106f, 0.1171f, 0.1220f}, + {-0.0248f, 0.0682f, -0.0481f}, + {0.0815f, 0.0846f, 0.1207f}, + {-0.0120f, -0.0055f, -0.0867f}, + {-0.0749f, -0.0634f, -0.0456f}, + {-0.1418f, -0.1457f, -0.1259f}, +}; + +// https://github.com/comfyanonymous/ComfyUI/blob/master/comfy/latent_formats.py#L32-L38 +const float sdxl_latent_rgb_proj[4][3] = { + {0.3651f, 0.4232f, 0.4341f}, + {-0.2533f, -0.0042f, 0.1068f}, + {0.1076f, 0.1111f, -0.0362f}, + {-0.3165f, -0.2492f, -0.2188f}}; + +// https://github.com/comfyanonymous/ComfyUI/blob/master/comfy/latent_formats.py#L32-L38 +const float sd_latent_rgb_proj[4][3]{ + {0.3512f, 0.2297f, 0.3227f}, + {0.3250f, 0.4974f, 0.2350f}, + {-0.2829f, 0.1762f, 0.2721f}, + {-0.2120f, -0.2616f, -0.7177f}}; + +void preview_latent_image(uint8_t* buffer, struct ggml_tensor* latents, const float (*latent_rgb_proj)[3], int width, int height, int dim) { + size_t buffer_head = 0; + for (int j = 0; j < height; j++) { + for (int i = 0; i < width; i++) { + size_t latent_id = (i * latents->nb[0] + j * latents->nb[1]); + float r = 0, g = 0, 
b = 0; + for (int d = 0; d < dim; d++) { + float value = *(float*)((char*)latents->data + latent_id + d * latents->nb[2]); + r += value * latent_rgb_proj[d][0]; + g += value * latent_rgb_proj[d][1]; + b += value * latent_rgb_proj[d][2]; + } + + // change range + r = r * .5f + .5f; + g = g * .5f + .5f; + b = b * .5f + .5f; + + // clamp rgb values to [0,1] range + r = r >= 0 ? r <= 1 ? r : 1 : 0; + g = g >= 0 ? g <= 1 ? g : 1 : 0; + b = b >= 0 ? b <= 1 ? b : 1 : 0; + + buffer[buffer_head++] = (uint8_t)(r * 255); + buffer[buffer_head++] = (uint8_t)(g * 255); + buffer[buffer_head++] = (uint8_t)(b * 255); + } + } +} \ No newline at end of file diff --git a/stable-diffusion.cpp b/stable-diffusion.cpp index b5860cfd3..51aebcdd6 100644 --- a/stable-diffusion.cpp +++ b/stable-diffusion.cpp @@ -20,6 +20,9 @@ #define STB_IMAGE_STATIC #include "stb_image.h" +#include "latent-preview.h" +#include + // #define STB_IMAGE_WRITE_IMPLEMENTATION // #define STB_IMAGE_WRITE_STATIC // #include "stb_image_write.h" @@ -67,6 +70,14 @@ void calculate_alphas_cumprod(float* alphas_cumprod, } } +void suppress_pp(int step, int steps, float time, void* data) { + (void)step; + (void)steps; + (void)time; + (void)data; + return; +} + /*=============================================== StableDiffusionGGML ================================================*/ class StableDiffusionGGML { @@ -164,7 +175,8 @@ class StableDiffusionGGML { bool diffusion_flash_attn, bool chroma_use_dit_mask, bool chroma_use_t5_mask, - int chroma_t5_mask_pad) { + int chroma_t5_mask_pad, + bool tae_preview_only) { use_tiny_autoencoder = taesd_path.size() > 0; #ifdef SD_USE_CUDA LOG_DEBUG("Using CUDA backend"); @@ -375,7 +387,7 @@ class StableDiffusionGGML { diffusion_model->alloc_params_buffer(); diffusion_model->get_param_tensors(tensors); - if (!use_tiny_autoencoder) { + if (!use_tiny_autoencoder || tae_preview_only) { if (vae_on_cpu && !ggml_backend_is_cpu(backend)) { LOG_INFO("VAE Autoencoder: Using CPU backend"); vae_backend 
= ggml_backend_cpu_init(); @@ -385,7 +397,8 @@ class StableDiffusionGGML { first_stage_model = std::make_shared(vae_backend, model_loader.tensor_storages_types, "first_stage_model", vae_decode_only, false, version); first_stage_model->alloc_params_buffer(); first_stage_model->get_param_tensors(tensors, "first_stage_model"); - } else { + } + if (use_tiny_autoencoder) { tae_first_stage = std::make_shared(backend, model_loader.tensor_storages_types, "decoder.layers", vae_decode_only, version); } // first_stage_model->get_param_tensors(tensors, "first_stage_model."); @@ -477,9 +490,10 @@ class StableDiffusionGGML { size_t clip_params_mem_size = cond_stage_model->get_params_buffer_size(); size_t unet_params_mem_size = diffusion_model->get_params_buffer_size(); size_t vae_params_mem_size = 0; - if (!use_tiny_autoencoder) { + if (!use_tiny_autoencoder || tae_preview_only) { vae_params_mem_size = first_stage_model->get_params_buffer_size(); - } else { + } + if (use_tiny_autoencoder) { if (!tae_first_stage->load_from_file(taesd_path)) { return false; } @@ -631,6 +645,7 @@ class StableDiffusionGGML { LOG_DEBUG("finished loaded file"); ggml_free(ctx); + use_tiny_autoencoder = use_tiny_autoencoder && !tae_preview_only; return true; } @@ -817,7 +832,116 @@ class StableDiffusionGGML { return {c_crossattn, y, c_concat}; } - ggml_tensor* sample(ggml_context* work_ctx, + void silent_tiling(ggml_tensor* input, ggml_tensor* output, const int scale, const int tile_size, const float tile_overlap_factor, on_tile_process on_processing) { + sd_progress_cb_t cb = sd_get_progress_callback(); + void* cbd = sd_get_progress_callback_data(); + sd_set_progress_callback((sd_progress_cb_t)suppress_pp, NULL); + sd_tiling(input, output, scale, tile_size, tile_overlap_factor, on_processing); + sd_set_progress_callback(cb, cbd); + } + + void preview_image(ggml_context* work_ctx, + int step, + struct ggml_tensor* latents, + enum SDVersion version, + sd_preview_t preview_mode, + ggml_tensor* result, + 
std::function step_callback) { + const uint32_t channel = 3; + uint32_t width = latents->ne[0]; + uint32_t height = latents->ne[1]; + uint32_t dim = latents->ne[2]; + if (preview_mode == SD_PREVIEW_PROJ) { + const float(*latent_rgb_proj)[channel]; + + if (dim == 16) { + // 16 channels VAE -> Flux or SD3 + + if (sd_version_is_sd3(version)) { + latent_rgb_proj = sd3_latent_rgb_proj; + } else if (sd_version_is_flux(version)) { + latent_rgb_proj = flux_latent_rgb_proj; + } else { + LOG_WARN("No latent to RGB projection known for this model"); + // unknown model + return; + } + + } else if (dim == 4) { + // 4 channels VAE + if (sd_version_is_sdxl(version)) { + latent_rgb_proj = sdxl_latent_rgb_proj; + } else if (sd_version_is_sd1(version) || sd_version_is_sd2(version)) { + latent_rgb_proj = sd_latent_rgb_proj; + } else { + // unknown model + LOG_WARN("No latent to RGB projection known for this model"); + return; + } + } else { + LOG_WARN("No latent to RGB projection known for this model"); + // unknown latent space + return; + } + uint8_t* data = (uint8_t*)malloc(width * height * channel * sizeof(uint8_t)); + + preview_latent_image(data, latents, latent_rgb_proj, width, height, dim); + sd_image_t image = { + width, + height, + channel, + data}; + step_callback(step, image); + free(image.data); + } else { + if (preview_mode == SD_PREVIEW_VAE) { + ggml_tensor_scale(latents, 1.0f / scale_factor); + if (vae_tiling) { + // split latent in 32x32 tiles and compute in several steps + auto on_tiling = [&](ggml_tensor* in, ggml_tensor* out, bool init) { + first_stage_model->compute(n_threads, in, true, &out); + }; + silent_tiling(latents, result, 8, 32, 0.5f, on_tiling); + + } else { + first_stage_model->compute(n_threads, latents, true, &result); + } + first_stage_model->free_compute_buffer(); + ggml_tensor_scale(latents, scale_factor); + + ggml_tensor_scale_output(result); + } else if (preview_mode == SD_PREVIEW_TAE) { + if (tae_first_stage == nullptr) { + LOG_WARN("TAE not 
found for preview"); + return; + } + if (vae_tiling) { + // split latent in 64x64 tiles and compute in several steps + auto on_tiling = [&](ggml_tensor* in, ggml_tensor* out, bool init) { + tae_first_stage->compute(n_threads, in, true, &out); + }; + silent_tiling(latents, result, 8, 64, 0.5f, on_tiling); + } else { + tae_first_stage->compute(n_threads, latents, true, &result); + } + tae_first_stage->free_compute_buffer(); + } else { + return; + } + ggml_tensor_clamp(result, 0.0f, 1.0f); + sd_image_t image = { + width * 8, + height * 8, + channel, + sd_tensor_to_image(result)}; + ggml_tensor_scale(result, 0); + step_callback(step, image); + free(image.data); + } + } + + ggml_tensor* + sample(ggml_context* work_ctx, ggml_tensor* init_latent, ggml_tensor* noise, SDCondition cond, @@ -880,6 +1004,16 @@ class StableDiffusionGGML { } struct ggml_tensor* denoised = ggml_dup_tensor(work_ctx, x); + struct ggml_tensor* preview_tensor = NULL; + auto sd_preview_mode = sd_get_preview_mode(); + if (sd_preview_mode != SD_PREVIEW_NONE && sd_preview_mode != SD_PREVIEW_PROJ) { + preview_tensor = ggml_new_tensor_4d(work_ctx, GGML_TYPE_F32, + (denoised->ne[0] * 8), + (denoised->ne[1] * 8), + 3, + denoised->ne[3]); + } + auto denoise = [&](ggml_tensor* input, float sigma, int step) -> ggml_tensor* { if (step == 1) { pretty_progress(0, (int)steps, 0); @@ -1008,10 +1142,6 @@ class StableDiffusionGGML { vec_denoised[i] = latent_result * c_out + vec_input[i] * c_skip; } int64_t t1 = ggml_time_us(); - if (step > 0) { - pretty_progress(step, (int)steps, (t1 - t0) / 1000000.f); - // LOG_INFO("step %d sampling completed taking %.2fs", step, (t1 - t0) * 1.0f / 1000000); - } if (noise_mask != nullptr) { for (int64_t x = 0; x < denoised->ne[0]; x++) { for (int64_t y = 0; y < denoised->ne[1]; y++) { @@ -1024,7 +1154,17 @@ class StableDiffusionGGML { } } } - + if (step > 0) { + pretty_progress(step, (int)steps, (t1 - t0) / 1000000.f); + // LOG_INFO("step %d sampling completed taking %.2fs", step, 
(t1 - t0) * 1.0f / 1000000); + } + auto sd_preview_cb = sd_get_preview_callback(); + auto sd_preview_mode = sd_get_preview_mode(); + if (sd_preview_cb != NULL) { + if (step % sd_get_preview_interval() == 0) { + preview_image(work_ctx, step, denoised, version, sd_preview_mode, preview_tensor, sd_preview_cb); + } + } return denoised; }; @@ -1170,7 +1310,8 @@ sd_ctx_t* new_sd_ctx(const char* model_path_c_str, bool diffusion_flash_attn, bool chroma_use_dit_mask, bool chroma_use_t5_mask, - int chroma_t5_mask_pad) { + int chroma_t5_mask_pad, + bool tae_preview_only) { sd_ctx_t* sd_ctx = (sd_ctx_t*)malloc(sizeof(sd_ctx_t)); if (sd_ctx == NULL) { return NULL; @@ -1215,7 +1356,8 @@ sd_ctx_t* new_sd_ctx(const char* model_path_c_str, diffusion_flash_attn, chroma_use_dit_mask, chroma_use_t5_mask, - chroma_t5_mask_pad)) { + chroma_t5_mask_pad, + tae_preview_only)) { delete sd_ctx->sd; sd_ctx->sd = NULL; free(sd_ctx); @@ -1495,27 +1637,31 @@ sd_image_t* generate_image(sd_ctx_t* sd_ctx, LOG_INFO("PHOTOMAKER: start_merge_step: %d", start_merge_step); } - struct ggml_tensor* x_0 = sd_ctx->sd->sample(work_ctx, - x_t, - noise, - cond, - uncond, - image_hint, - control_strength, - cfg_scale, - cfg_scale, - guidance, - eta, - sample_method, - sigmas, - start_merge_step, - id_cond, - ref_latents, - skip_layers, - slg_scale, - skip_layer_start, - skip_layer_end, - noise_mask); + struct ggml_tensor* x_0; + std::thread t([&]() { + x_0 = sd_ctx->sd->sample(work_ctx, + x_t, + noise, + cond, + uncond, + image_hint, + control_strength, + cfg_scale, + cfg_scale, + guidance, + eta, + sample_method, + sigmas, + start_merge_step, + id_cond, + ref_latents, + skip_layers, + slg_scale, + skip_layer_start, + skip_layer_end, + noise_mask); + }); + t.join(); // struct ggml_tensor* x_0 = load_tensor_from_file(ctx, "samples_ddim.bin"); // print_ggml_tensor(x_0); @@ -1629,6 +1775,10 @@ sd_image_t* txt2img(sd_ctx_t* sd_ctx, if (sd_ctx->sd->stacked_id) { params.mem_size += static_cast(10 * 1024 * 1024); // 
10 MB } + auto sd_preview_mode = sd_get_preview_mode(); + if (sd_preview_mode != SD_PREVIEW_NONE && sd_preview_mode != SD_PREVIEW_PROJ) { + params.mem_size *= 2; + } params.mem_size += width * height * 3 * sizeof(float); params.mem_size *= batch_count; params.mem_buffer = NULL; @@ -1675,7 +1825,8 @@ sd_image_t* txt2img(sd_ctx_t* sd_ctx, skip_layers_vec, slg_scale, skip_layer_start, - skip_layer_end); + skip_layer_end, + NULL); size_t t1 = ggml_time_ms(); @@ -1959,7 +2110,9 @@ SD_API sd_image_t* img2vid(sd_ctx_t* sd_ctx, sample_method, sigmas, -1, - SDCondition(NULL, NULL, NULL)); + SDCondition(NULL, NULL, NULL), + {}, {}, + 0, 0.001, 0.2, NULL); int64_t t2 = ggml_time_ms(); LOG_INFO("sampling completed, taking %.2fs", (t2 - t1) * 1.0f / 1000); diff --git a/stable-diffusion.h b/stable-diffusion.h index b4d6fc327..4fc994ea1 100644 --- a/stable-diffusion.h +++ b/stable-diffusion.h @@ -112,13 +112,13 @@ enum sd_log_level_t { SD_LOG_ERROR }; -typedef void (*sd_log_cb_t)(enum sd_log_level_t level, const char* text, void* data); -typedef void (*sd_progress_cb_t)(int step, int steps, float time, void* data); - -SD_API void sd_set_log_callback(sd_log_cb_t sd_log_cb, void* data); -SD_API void sd_set_progress_callback(sd_progress_cb_t cb, void* data); -SD_API int32_t get_num_physical_cores(); -SD_API const char* sd_get_system_info(); +enum sd_preview_t { + SD_PREVIEW_NONE, + SD_PREVIEW_PROJ, + SD_PREVIEW_TAE, + SD_PREVIEW_VAE, + N_PREVIEWS +}; typedef struct { uint32_t width; @@ -127,6 +127,17 @@ typedef struct { uint8_t* data; } sd_image_t; +typedef void (*sd_log_cb_t)(enum sd_log_level_t level, const char* text, void* data); +typedef void (*sd_progress_cb_t)(int step, int steps, float time, void* data); +typedef void (*sd_preview_cb_t)(int, sd_image_t); + + +SD_API void sd_set_log_callback(sd_log_cb_t sd_log_cb, void* data); +SD_API void sd_set_progress_callback(sd_progress_cb_t cb, void* data); +SD_API void sd_set_preview_callback(sd_preview_cb_t cb, sd_preview_t mode, int 
interval); +SD_API int32_t get_num_physical_cores(); +SD_API const char* sd_get_system_info(); + typedef struct sd_ctx_t sd_ctx_t; SD_API sd_ctx_t* new_sd_ctx(const char* model_path, @@ -153,7 +164,8 @@ SD_API sd_ctx_t* new_sd_ctx(const char* model_path, bool diffusion_flash_attn, bool chroma_use_dit_mask, bool chroma_use_t5_mask, - int chroma_t5_mask_pad); + int chroma_t5_mask_pad, + bool tae_preview_only); SD_API void free_sd_ctx(sd_ctx_t* sd_ctx); diff --git a/util.cpp b/util.cpp index 631c12066..6b29aecb9 100644 --- a/util.cpp +++ b/util.cpp @@ -247,6 +247,10 @@ int32_t get_num_physical_cores() { static sd_progress_cb_t sd_progress_cb = NULL; void* sd_progress_cb_data = NULL; +static sd_preview_cb_t sd_preview_cb = NULL; +sd_preview_t sd_preview_mode = SD_PREVIEW_NONE; +int sd_preview_interval = 1; + std::u32string utf8_to_utf32(const std::string& utf8_str) { std::wstring_convert, char32_t> converter; return converter.from_bytes(utf8_str); @@ -420,6 +424,29 @@ void sd_set_progress_callback(sd_progress_cb_t cb, void* data) { sd_progress_cb = cb; sd_progress_cb_data = data; } +void sd_set_preview_callback(sd_preview_cb_t cb, sd_preview_t mode = SD_PREVIEW_PROJ, int interval = 1) { + sd_preview_cb = cb; + sd_preview_mode = mode; + sd_preview_interval = interval; +} + +sd_preview_cb_t sd_get_preview_callback() { + return sd_preview_cb; +} + +sd_preview_t sd_get_preview_mode() { + return sd_preview_mode; +} +int sd_get_preview_interval() { + return sd_preview_interval; +} + +sd_progress_cb_t sd_get_progress_callback() { + return sd_progress_cb; +} +void* sd_get_progress_callback_data() { + return sd_progress_cb_data; +} const char* sd_get_system_info() { static char buffer[1024]; std::stringstream ss; diff --git a/util.h b/util.h index 14fa812e5..36a2e18af 100644 --- a/util.h +++ b/util.h @@ -54,6 +54,13 @@ std::string trim(const std::string& s); std::vector> parse_prompt_attention(const std::string& text); +sd_progress_cb_t sd_get_progress_callback(); +void* 
sd_get_progress_callback_data(); + +sd_preview_cb_t sd_get_preview_callback(); +sd_preview_t sd_get_preview_mode(); +int sd_get_preview_interval(); + #define LOG_DEBUG(format, ...) log_printf(SD_LOG_DEBUG, __FILE__, __LINE__, format, ##__VA_ARGS__) #define LOG_INFO(format, ...) log_printf(SD_LOG_INFO, __FILE__, __LINE__, format, ##__VA_ARGS__) #define LOG_WARN(format, ...) log_printf(SD_LOG_WARN, __FILE__, __LINE__, format, ##__VA_ARGS__) From c10fe89c2a9a7738aa60799c6254c1663de77e35 Mon Sep 17 00:00:00 2001 From: RodrigoSantiago Date: Sun, 6 Jul 2025 03:15:52 -0300 Subject: [PATCH 2/3] Txt2Img upgrade --- denoiser.hpp | 53 ++++++--- examples/cli/main.cpp | 68 +++++++++++- ggml_extend.hpp | 6 +- model.cpp | 2 +- stable-diffusion.cpp | 243 ++++++++++++++++++++++++++++++------------ stable-diffusion.h | 6 +- util.cpp | 6 +- util.h | 2 +- 8 files changed, 288 insertions(+), 98 deletions(-) diff --git a/denoiser.hpp b/denoiser.hpp index 2bd0b939a..b5359092d 100644 --- a/denoiser.hpp +++ b/denoiser.hpp @@ -489,7 +489,7 @@ struct FluxFlowDenoiser : public Denoiser { } }; -typedef std::function denoise_cb_t; +typedef std::function denoise_cb_t; // k diffusion reverse ODE: dx = (x - D(x;\sigma)) / \sigma dt; \sigma(t) = t static void sample_k_diffusion(sample_method_t method, @@ -500,6 +500,8 @@ static void sample_k_diffusion(sample_method_t method, std::shared_ptr rng, float eta) { size_t steps = sigmas.size() - 1; + bool early_stop = false; + // sample_euler_ancestral switch (method) { case EULER_A: { @@ -510,7 +512,8 @@ static void sample_k_diffusion(sample_method_t method, float sigma = sigmas[i]; // denoise - ggml_tensor* denoised = model(x, sigma, i + 1); + ggml_tensor* denoised = model(x, sigma, i + 1, early_stop); + if (early_stop) break; // d = (x - denoised) / sigma { @@ -563,7 +566,8 @@ static void sample_k_diffusion(sample_method_t method, float sigma = sigmas[i]; // denoise - ggml_tensor* denoised = model(x, sigma, i + 1); + ggml_tensor* denoised = model(x, 
sigma, i + 1, early_stop); + if (early_stop) break; // d = (x - denoised) / sigma { @@ -594,7 +598,8 @@ static void sample_k_diffusion(sample_method_t method, for (int i = 0; i < steps; i++) { // denoise - ggml_tensor* denoised = model(x, sigmas[i], -(i + 1)); + ggml_tensor* denoised = model(x, sigmas[i], -(i + 1), early_stop); + if (early_stop) break; // d = (x - denoised) / sigma { @@ -628,7 +633,9 @@ static void sample_k_diffusion(sample_method_t method, vec_x2[j] = vec_x[j] + vec_d[j] * dt; } - ggml_tensor* denoised = model(x2, sigmas[i + 1], i + 1); + ggml_tensor* denoised = model(x2, sigmas[i + 1], i + 1, early_stop); + if (early_stop) break; + float* vec_denoised = (float*)denoised->data; for (int j = 0; j < ggml_nelements(x); j++) { float d2 = (vec_x2[j] - vec_denoised[j]) / sigmas[i + 1]; @@ -644,7 +651,8 @@ static void sample_k_diffusion(sample_method_t method, for (int i = 0; i < steps; i++) { // denoise - ggml_tensor* denoised = model(x, sigmas[i], i + 1); + ggml_tensor* denoised = model(x, sigmas[i], i + 1, early_stop); + if (early_stop) break; // d = (x - denoised) / sigma { @@ -680,7 +688,9 @@ static void sample_k_diffusion(sample_method_t method, vec_x2[j] = vec_x[j] + vec_d[j] * dt_1; } - ggml_tensor* denoised = model(x2, sigma_mid, i + 1); + ggml_tensor* denoised = model(x2, sigma_mid, i + 1, early_stop); + if (early_stop) break; + float* vec_denoised = (float*)denoised->data; for (int j = 0; j < ggml_nelements(x); j++) { float d2 = (vec_x2[j] - vec_denoised[j]) / sigma_mid; @@ -697,7 +707,8 @@ static void sample_k_diffusion(sample_method_t method, for (int i = 0; i < steps; i++) { // denoise - ggml_tensor* denoised = model(x, sigmas[i], i + 1); + ggml_tensor* denoised = model(x, sigmas[i], i + 1, early_stop); + if (early_stop) break; // get_ancestral_step float sigma_up = std::min(sigmas[i + 1], @@ -741,7 +752,8 @@ static void sample_k_diffusion(sample_method_t method, vec_x2[j] = (sigma_fn(s) / sigma_fn(t)) * vec_x[j] - (exp(-h * 0.5f) - 1) * 
vec_denoised[j]; } - ggml_tensor* denoised = model(x2, sigmas[i + 1], i + 1); + ggml_tensor* denoised = model(x2, sigmas[i + 1], i + 1, early_stop); + if (early_stop) break; // Second half-step for (int j = 0; j < ggml_nelements(x); j++) { @@ -771,7 +783,8 @@ static void sample_k_diffusion(sample_method_t method, for (int i = 0; i < steps; i++) { // denoise - ggml_tensor* denoised = model(x, sigmas[i], i + 1); + ggml_tensor* denoised = model(x, sigmas[i], i + 1, early_stop); + if (early_stop) break; float t = t_fn(sigmas[i]); float t_next = t_fn(sigmas[i + 1]); @@ -810,7 +823,8 @@ static void sample_k_diffusion(sample_method_t method, for (int i = 0; i < steps; i++) { // denoise - ggml_tensor* denoised = model(x, sigmas[i], i + 1); + ggml_tensor* denoised = model(x, sigmas[i], i + 1, early_stop); + if (early_stop) break; float t = t_fn(sigmas[i]); float t_next = t_fn(sigmas[i + 1]); @@ -860,7 +874,9 @@ static void sample_k_diffusion(sample_method_t method, float* vec_x_next = (float*)x_next->data; // Denoising step - ggml_tensor* denoised = model(x_cur, sigma, i + 1); + ggml_tensor* denoised = model(x_cur, sigma, i + 1, early_stop); + if (early_stop) break; + float* vec_denoised = (float*)denoised->data; // d_cur = (x_cur - denoised) / sigma struct ggml_tensor* d_cur = ggml_dup_tensor(work_ctx, x_cur); @@ -931,7 +947,9 @@ static void sample_k_diffusion(sample_method_t method, float t_next = sigmas[i + 1]; // Denoising step - ggml_tensor* denoised = model(x, sigma, i + 1); + ggml_tensor* denoised = model(x, sigma, i + 1, early_stop); + if (early_stop) break; + float* vec_denoised = (float*)denoised->data; struct ggml_tensor* d_cur = ggml_dup_tensor(work_ctx, x); float* vec_d_cur = (float*)d_cur->data; @@ -1003,7 +1021,8 @@ static void sample_k_diffusion(sample_method_t method, float sigma = sigmas[i]; // denoise - ggml_tensor* denoised = model(x, sigma, i + 1); + ggml_tensor* denoised = model(x, sigma, i + 1, early_stop); + if (early_stop) break; // x = denoised { 
@@ -1129,7 +1148,8 @@ static void sample_k_diffusion(sample_method_t method, // defined in Karras et al. (2022), p. 3, Table 1 and // p. 8 (7), compare also p. 38 (226) therein. struct ggml_tensor* model_output = - model(x, sigma, i + 1); + model(x, sigma, i + 1, early_stop); + if (early_stop) break; // Here model_output is still the k-diffusion denoiser // output, not the U-net output F_theta(c_in(sigma) x; // ...) in Karras et al. (2022), whereas Diffusers' @@ -1288,7 +1308,8 @@ static void sample_k_diffusion(sample_method_t method, } } struct ggml_tensor* model_output = - model(x, sigma, i + 1); + model(x, sigma, i + 1, early_stop); + if (early_stop) break; { float* vec_x = (float*)x->data; float* vec_model_output = diff --git a/examples/cli/main.cpp b/examples/cli/main.cpp index d06040445..381e2bef2 100644 --- a/examples/cli/main.cpp +++ b/examples/cli/main.cpp @@ -61,6 +61,13 @@ const char* modes_str[] = { "convert", }; +const char* previews_str[] = { + "none", + "proj", + "tae", + "vae", +}; + enum SDMode { TXT2IMG, IMG2IMG, @@ -136,6 +143,11 @@ struct SDParams { bool chroma_use_dit_mask = true; bool chroma_use_t5_mask = false; int chroma_t5_mask_pad = 1; + + sd_preview_t preview_method = SD_PREVIEW_NONE; + int preview_interval = 1; + std::string preview_path = "preview.png"; + bool taesd_preview = false; }; void print_params(SDParams params) { @@ -192,6 +204,8 @@ void print_params(SDParams params) { printf(" chroma_use_dit_mask: %s\n", params.chroma_use_dit_mask ? "true" : "false"); printf(" chroma_use_t5_mask: %s\n", params.chroma_use_t5_mask ? 
"true" : "false"); printf(" chroma_t5_mask_pad: %d\n", params.chroma_t5_mask_pad); + printf(" preview_mode: %s\n", previews_str[params.preview_method]); + printf(" preview_interval: %d\n", params.preview_interval); } void print_usage(int argc, const char* argv[]) { @@ -208,7 +222,8 @@ void print_usage(int argc, const char* argv[]) { printf(" --clip_g path to the clip-g text encoder\n"); printf(" --t5xxl path to the the t5xxl text encoder\n"); printf(" --vae [VAE] path to vae\n"); - printf(" --taesd [TAESD_PATH] path to taesd. Using Tiny AutoEncoder for fast decoding (low quality)\n"); + printf(" --taesd [TAESD] path to taesd. Using Tiny AutoEncoder for fast decoding (low quality)\n"); + printf(" --taesd-preview-only prevents usage of taesd for decoding the final image. (for use with --preview %s)\n", previews_str[SD_PREVIEW_TAE]); printf(" --control-net [CONTROL_PATH] path to control net model\n"); printf(" --embd-dir [EMBEDDING_PATH] path to embeddings\n"); printf(" --stacked-id-embd-dir [DIR] path to PHOTOMAKER stacked id embeddings\n"); @@ -262,6 +277,10 @@ void print_usage(int argc, const char* argv[]) { printf(" --chroma-disable-dit-mask disable dit mask for chroma\n"); printf(" --chroma-enable-t5-mask enable t5 mask for chroma\n"); printf(" --chroma-t5-mask-pad PAD_SIZE t5 mask pad size of chroma\n"); + printf(" --preview {%s,%s,%s,%s} preview method. 
(default is %s(disabled))\n", previews_str[0], previews_str[1], previews_str[2], previews_str[3], previews_str[SD_PREVIEW_NONE]); + printf(" %s is the fastest\n", previews_str[SD_PREVIEW_PROJ]); + printf(" --preview-interval [N] How often to save the image preview\n"); + printf(" --preview-path [PATH] path to write preview image to (default: ./preview.png)\n"); + printf(" -v, --verbose print extra info\n"); } @@ -525,6 +544,8 @@ void parse_args(int argc, const char** argv, SDParams& params) { params.diffusion_flash_attn = true; // can reduce MEM significantly } else if (arg == "--canny") { params.canny_preprocess = true; + } else if (arg == "--taesd-preview-only") { + params.taesd_preview = true; } else if (arg == "-b" || arg == "--batch-count") { if (++i >= argc) { invalid_arg = true; break; } @@ -663,7 +684,36 @@ void parse_args(int argc, const char** argv, SDParams& params) { break; } params.chroma_t5_mask_pad = std::stoi(argv[i]); - } else { + } else if (arg == "--preview") { + if (++i >= argc) { + invalid_arg = true; + break; + } + const char* preview = argv[i]; + int preview_method = -1; + for (int m = 0; m < N_PREVIEWS; m++) { + if (!strcmp(preview, previews_str[m])) { + preview_method = m; + } + } + if (preview_method == -1) { + invalid_arg = true; + break; + } + params.preview_method = (sd_preview_t)preview_method; + } else if (arg == "--preview-interval") { + if (++i >= argc) { + invalid_arg = true; + break; + } + params.preview_interval = std::stoi(argv[i]); + } else if (arg == "--preview-path") { + if (++i >= argc) { + invalid_arg = true; + break; + } + params.preview_path = argv[i]; + } else { fprintf(stderr, "error: unknown argument: %s\n", arg.c_str()); print_usage(argc, argv); exit(1); @@ -827,12 +877,20 @@ void sd_log_cb(enum sd_log_level_t level, const char* log, void* data) { fflush(out_stream); } +const char* preview_path; + +void step_callback(int step, int batch, sd_image_t image) { + stbi_write_png(preview_path, image.width, image.height, image.channel,
image.data, 0); +} + int main(int argc, const char* argv[]) { SDParams params; parse_args(argc, argv, params); + preview_path = params.preview_path.c_str(); sd_set_log_callback(sd_log_cb, (void*)¶ms); + sd_set_preview_callback((sd_preview_cb_t)step_callback, params.preview_method, params.preview_interval); if (params.verbose) { print_params(params); @@ -975,7 +1033,8 @@ int main(int argc, const char* argv[]) { params.diffusion_flash_attn, params.chroma_use_dit_mask, params.chroma_use_t5_mask, - params.chroma_t5_mask_pad); + params.chroma_t5_mask_pad, + params.taesd_preview); if (sd_ctx == NULL) { printf("new_sd_ctx_t failed\n"); @@ -1042,7 +1101,8 @@ int main(int argc, const char* argv[]) { params.skip_layers.size(), params.slg_scale, params.skip_layer_start, - params.skip_layer_end); + params.skip_layer_end, + false); } else if (params.mode == IMG2IMG || params.mode == IMG2VID) { sd_image_t input_image = {(uint32_t)params.width, (uint32_t)params.height, diff --git a/ggml_extend.hpp b/ggml_extend.hpp index 424ebd5e4..9cdbc4536 100644 --- a/ggml_extend.hpp +++ b/ggml_extend.hpp @@ -635,7 +635,7 @@ __STATIC_INLINE__ void sd_tiling(ggml_tensor* input, ggml_tensor* output, const on_processing(input_tile, NULL, true); int num_tiles = ceil((float)input_width / non_tile_overlap) * ceil((float)input_height / non_tile_overlap); LOG_DEBUG("processing %i tiles", num_tiles); - pretty_progress(1, num_tiles, 0.0f); + pretty_progress(1, num_tiles, 0.0f, 0); int tile_count = 1; bool last_y = false, last_x = false; float last_time = 0.0f; @@ -655,13 +655,13 @@ __STATIC_INLINE__ void sd_tiling(ggml_tensor* input, ggml_tensor* output, const ggml_merge_tensor_2d(output_tile, output, x * scale, y * scale, tile_overlap * scale); int64_t t2 = ggml_time_ms(); last_time = (t2 - t1) / 1000.0f; - pretty_progress(tile_count, num_tiles, last_time); + pretty_progress(tile_count, num_tiles, last_time, 0); tile_count++; } last_x = false; } if (tile_count < num_tiles) { - 
pretty_progress(num_tiles, num_tiles, last_time); + pretty_progress(num_tiles, num_tiles, last_time, 0); } ggml_free(tiles_ctx); } diff --git a/model.cpp b/model.cpp index 333f5d36a..7617e0f72 100644 --- a/model.cpp +++ b/model.cpp @@ -1923,7 +1923,7 @@ bool ModelLoader::load_tensors(on_new_tensor_cb_t on_new_tensor_cb, ggml_backend } } int64_t t2 = ggml_time_ms(); - pretty_progress(++tensor_count, processed_tensor_storages.size(), (t2 - t1) / 1000.0f); + pretty_progress(++tensor_count, processed_tensor_storages.size(), (t2 - t1) / 1000.0f, 0); t1 = t2; } diff --git a/stable-diffusion.cpp b/stable-diffusion.cpp index 51aebcdd6..16981ec30 100644 --- a/stable-diffusion.cpp +++ b/stable-diffusion.cpp @@ -22,6 +22,7 @@ #include "latent-preview.h" #include +#include // #define STB_IMAGE_WRITE_IMPLEMENTATION // #define STB_IMAGE_WRITE_STATIC @@ -78,6 +79,94 @@ void suppress_pp(int step, int steps, float time, void* data) { return; } +class StableThreads { +private: + static inline std::mutex mtx; + static inline std::condition_variable cv; + static inline std::thread::id current_index = {}; + static inline std::vector thread_indices; + static inline std::unordered_map thread_cancel; + static inline std::unordered_map batch_thread; +public: + + static int count() { + std::unique_lock lock(mtx); + return thread_indices.size(); + } + + static void start() { + std::unique_lock lock(mtx); + current_index = thread_indices[0]; + cv.notify_all(); + } + + static void clear() { + std::unique_lock lock(mtx); + thread_indices.clear(); + thread_cancel.clear(); + batch_thread.clear(); + } + + static void register_thread(int id) { + std::unique_lock lock(mtx); + thread_indices.push_back(std::this_thread::get_id()); + batch_thread[id] = std::this_thread::get_id(); + + cv.wait(lock, []() { + return std::this_thread::get_id() == current_index; + }); + } + + static void remove_thread() { + std::unique_lock lock(mtx); + + auto it = std::find(thread_indices.begin(), thread_indices.end(), 
std::this_thread::get_id()); + long long indice; + if (it != thread_indices.end()) { + indice = std::distance(thread_indices.begin(), it); + thread_indices.erase(it); + } else { + indice = 0; + } + if (thread_indices.empty()) { + return; + } + current_index = thread_indices[indice % thread_indices.size()]; + cv.notify_all(); + } + + static void move_next() { + std::unique_lock lock(mtx); + if (thread_indices.size() < 2) + return; + + auto it = std::find(thread_indices.begin(), thread_indices.end(), std::this_thread::get_id()); + long long indice; + if (it != thread_indices.end()) { + indice = std::distance(thread_indices.begin(), it); + } else { + indice = 0; + } + + current_index = thread_indices[(indice + 1) % thread_indices.size()]; + cv.notify_all(); + cv.wait(lock, []() { + return std::this_thread::get_id() == current_index; + }); + } + + static bool is_request_cancel() { + std::unique_lock lock(mtx); + return thread_cancel.find(std::this_thread::get_id()) != thread_cancel.end(); + } + + static void request_cancel(int id) { + std::unique_lock lock(mtx); + if (batch_thread.find(id) != batch_thread.end()) { + thread_cancel[batch_thread[id]] = true; + } + } +}; /*=============================================== StableDiffusionGGML ================================================*/ class StableDiffusionGGML { @@ -842,11 +931,12 @@ class StableDiffusionGGML { void preview_image(ggml_context* work_ctx, int step, + int batch, struct ggml_tensor* latents, enum SDVersion version, sd_preview_t preview_mode, ggml_tensor* result, - std::function step_callback) { + std::function step_callback) { const uint32_t channel = 3; uint32_t width = latents->ne[0]; uint32_t height = latents->ne[1]; @@ -891,7 +981,7 @@ class StableDiffusionGGML { height, channel, data}; - step_callback(step, image); + step_callback(step, batch, image); free(image.data); } else { if (preview_mode == SD_PREVIEW_VAE) { @@ -920,7 +1010,7 @@ class StableDiffusionGGML { auto on_tiling = [&](ggml_tensor* in, 
ggml_tensor* out, bool init) { tae_first_stage->compute(n_threads, in, true, &out); }; - silent_tiling(latents, result, 8, 64, 0.5f, on_tiling); + silent_tiling(latents, result, 8, 32, 0.5f, on_tiling); } else { tae_first_stage->compute(n_threads, latents, true, &result); } @@ -935,7 +1025,7 @@ class StableDiffusionGGML { channel, sd_tensor_to_image(result)}; ggml_tensor_scale(result, 0); - step_callback(step, image); + step_callback(step, batch, image); free(image.data); } } @@ -961,7 +1051,8 @@ class StableDiffusionGGML { float slg_scale = 0, float skip_layer_start = 0.01, float skip_layer_end = 0.2, - ggml_tensor* noise_mask = nullptr) { + ggml_tensor* noise_mask = nullptr, + int batchId = 0) { LOG_DEBUG("Sample"); struct ggml_init_params params; size_t data_size = ggml_row_size(init_latent->type, init_latent->ne[0]); @@ -1014,9 +1105,9 @@ class StableDiffusionGGML { denoised->ne[3]); } - auto denoise = [&](ggml_tensor* input, float sigma, int step) -> ggml_tensor* { + auto denoise = [&](ggml_tensor* input, float sigma, int step, bool& early_stop) -> ggml_tensor* { if (step == 1) { - pretty_progress(0, (int)steps, 0); + pretty_progress(0, (int)steps, 0, 0); } int64_t t0 = ggml_time_us(); @@ -1155,16 +1246,21 @@ class StableDiffusionGGML { } } if (step > 0) { - pretty_progress(step, (int)steps, (t1 - t0) / 1000000.f); + pretty_progress(step, (int)steps, (t1 - t0) / 1000000.f, batchId); // LOG_INFO("step %d sampling completed taking %.2fs", step, (t1 - t0) * 1.0f / 1000000); } auto sd_preview_cb = sd_get_preview_callback(); auto sd_preview_mode = sd_get_preview_mode(); if (sd_preview_cb != NULL) { - if (step % sd_get_preview_interval() == 0) { - preview_image(work_ctx, step, denoised, version, sd_preview_mode, preview_tensor, sd_preview_cb); + if (step == 1 || step >= steps || step % sd_get_preview_interval() == 0) { + preview_image(work_ctx, step, batchId, denoised, version, sd_preview_mode, preview_tensor, sd_preview_cb); + LOG_INFO("Preview Rendered"); + 
StableThreads::move_next(); } } + if (!early_stop) { + early_stop = StableThreads::is_request_cancel(); + } return denoised; }; @@ -1374,6 +1470,10 @@ void free_sd_ctx(sd_ctx_t* sd_ctx) { free(sd_ctx); } +void sd_request_cancel_batch(int id) { + StableThreads::request_cancel(id); +} + sd_image_t* generate_image(sd_ctx_t* sd_ctx, struct ggml_context* work_ctx, ggml_tensor* init_latent, @@ -1399,7 +1499,8 @@ sd_image_t* generate_image(sd_ctx_t* sd_ctx, float slg_scale = 0, float skip_layer_start = 0.01, float skip_layer_end = 0.2, - ggml_tensor* masked_image = NULL) { + ggml_tensor* masked_image = NULL, + bool skip_image_gen = false) { if (seed < 0) { // Generally, when using the provided command line, the seed is always >0. // However, to prevent potential issues if 'stable-diffusion.cpp' is invoked as a library @@ -1576,7 +1677,6 @@ sd_image_t* generate_image(sd_ctx_t* sd_ctx, } // Sample - std::vector final_latents; // collect latents to decode int C = 4; if (sd_version_is_sd3(sd_ctx->sd->version)) { C = 16; @@ -1619,26 +1719,45 @@ sd_image_t* generate_image(sd_ctx_t* sd_ctx, } else { noise_mask = masked_image; } + + sd_image_t* result_images = (sd_image_t*)calloc(batch_count, sizeof(sd_image_t)); + if (result_images == NULL) { + ggml_free(work_ctx); + return NULL; + } + + for (size_t i = 0; i < batch_count; i++) { + result_images[i].width = width; + result_images[i].height = height; + result_images[i].channel = 3; + result_images[i].data = nullptr; + } + + StableThreads::clear(); + std::vector threads; for (int b = 0; b < batch_count; b++) { - int64_t sampling_start = ggml_time_ms(); - int64_t cur_seed = seed + b; - LOG_INFO("generating image: %i/%i - seed %" PRId64, b + 1, batch_count, cur_seed); + threads.emplace_back(([&, b]() { + StableThreads::register_thread(b); + if (StableThreads::is_request_cancel()) return; - sd_ctx->sd->rng->manual_seed(cur_seed); - struct ggml_tensor* x_t = init_latent; - struct ggml_tensor* noise = ggml_new_tensor_4d(work_ctx, 
GGML_TYPE_F32, W, H, C, 1); - ggml_tensor_set_f32_randn(noise, sd_ctx->sd->rng); + struct ggml_tensor* x_0; + int64_t sampling_start = ggml_time_ms(); + int64_t cur_seed = seed + b; + LOG_INFO("generating image: %i/%i - seed %" PRId64, b + 1, batch_count, cur_seed); - int start_merge_step = -1; - if (sd_ctx->sd->stacked_id) { - start_merge_step = int(sd_ctx->sd->pmid_model->style_strength / 100.f * sample_steps); - // if (start_merge_step > 30) - // start_merge_step = 30; - LOG_INFO("PHOTOMAKER: start_merge_step: %d", start_merge_step); - } + sd_ctx->sd->rng->manual_seed(cur_seed); + struct ggml_tensor* x_t = init_latent; + struct ggml_tensor* noise = ggml_new_tensor_4d(work_ctx, GGML_TYPE_F32, W, H, C, 1); + ggml_tensor_set_f32_randn(noise, sd_ctx->sd->rng); + + int start_merge_step = -1; + if (sd_ctx->sd->stacked_id) { + start_merge_step = int(sd_ctx->sd->pmid_model->style_strength / 100.f * sample_steps); + // if (start_merge_step > 30) + // start_merge_step = 30; + LOG_INFO("PHOTOMAKER: start_merge_step: %d", start_merge_step); + } - struct ggml_tensor* x_0; - std::thread t([&]() { x_0 = sd_ctx->sd->sample(work_ctx, x_t, noise, @@ -1659,54 +1778,40 @@ sd_image_t* generate_image(sd_ctx_t* sd_ctx, slg_scale, skip_layer_start, skip_layer_end, - noise_mask); - }); - t.join(); + noise_mask, b); + + int64_t sampling_end = ggml_time_ms(); + LOG_INFO("sampling completed, taking %.2fs", (sampling_end - sampling_start) * 1.0f / 1000); + if (!StableThreads::is_request_cancel()) { + if (!skip_image_gen) { + struct ggml_tensor* img = sd_ctx->sd->decode_first_stage(work_ctx, x_0); + result_images[b].data = sd_tensor_to_image(img); + int64_t decode_end = ggml_time_ms(); + LOG_INFO("latent %" PRId64 " decoded, taking %.2fs", b, (decode_end - sampling_end) * 1.0f / 1000); + auto callback = sd_get_preview_callback(); + callback(-1, b, result_images[b]); + } + } - // struct ggml_tensor* x_0 = load_tensor_from_file(ctx, "samples_ddim.bin"); - // print_ggml_tensor(x_0); - int64_t 
sampling_end = ggml_time_ms(); - LOG_INFO("sampling completed, taking %.2fs", (sampling_end - sampling_start) * 1.0f / 1000); - final_latents.push_back(x_0); + StableThreads::remove_thread(); + })); + } + while (StableThreads::count() < batch_count) { + std::this_thread::sleep_for(std::chrono::milliseconds(1)); } + StableThreads::start(); + for (auto& t : threads) { + t.join(); + } + StableThreads::clear(); if (sd_ctx->sd->free_params_immediately) { sd_ctx->sd->diffusion_model->free_params_buffer(); } - int64_t t3 = ggml_time_ms(); - LOG_INFO("generating %" PRId64 " latent images completed, taking %.2fs", final_latents.size(), (t3 - t1) * 1.0f / 1000); - // Decode to image - LOG_INFO("decoding %zu latents", final_latents.size()); - std::vector decoded_images; // collect decoded images - for (size_t i = 0; i < final_latents.size(); i++) { - t1 = ggml_time_ms(); - struct ggml_tensor* img = sd_ctx->sd->decode_first_stage(work_ctx, final_latents[i] /* x_0 */); - // print_ggml_tensor(img); - if (img != NULL) { - decoded_images.push_back(img); - } - int64_t t2 = ggml_time_ms(); - LOG_INFO("latent %" PRId64 " decoded, taking %.2fs", i + 1, (t2 - t1) * 1.0f / 1000); - } - - int64_t t4 = ggml_time_ms(); - LOG_INFO("decode_first_stage completed, taking %.2fs", (t4 - t3) * 1.0f / 1000); if (sd_ctx->sd->free_params_immediately && !sd_ctx->sd->use_tiny_autoencoder) { sd_ctx->sd->first_stage_model->free_params_buffer(); } - sd_image_t* result_images = (sd_image_t*)calloc(batch_count, sizeof(sd_image_t)); - if (result_images == NULL) { - ggml_free(work_ctx); - return NULL; - } - - for (size_t i = 0; i < decoded_images.size(); i++) { - result_images[i].width = width; - result_images[i].height = height; - result_images[i].channel = 3; - result_images[i].data = sd_tensor_to_image(decoded_images[i]); - } ggml_free(work_ctx); return result_images; @@ -1757,7 +1862,8 @@ sd_image_t* txt2img(sd_ctx_t* sd_ctx, size_t skip_layers_count = 0, float slg_scale = 0, float skip_layer_start = 
0.01, - float skip_layer_end = 0.2) { + float skip_layer_end = 0.2, + bool skip_image_gen = false) { std::vector skip_layers_vec(skip_layers, skip_layers + skip_layers_count); LOG_DEBUG("txt2img %dx%d", width, height); if (sd_ctx == NULL) { @@ -1826,7 +1932,8 @@ sd_image_t* txt2img(sd_ctx_t* sd_ctx, slg_scale, skip_layer_start, skip_layer_end, - NULL); + NULL, + skip_image_gen); size_t t1 = ggml_time_ms(); diff --git a/stable-diffusion.h b/stable-diffusion.h index 4fc994ea1..2e18f6795 100644 --- a/stable-diffusion.h +++ b/stable-diffusion.h @@ -129,7 +129,7 @@ typedef struct { typedef void (*sd_log_cb_t)(enum sd_log_level_t level, const char* text, void* data); typedef void (*sd_progress_cb_t)(int step, int steps, float time, void* data); -typedef void (*sd_preview_cb_t)(int, sd_image_t); +typedef void (*sd_preview_cb_t)(int, int, sd_image_t); SD_API void sd_set_log_callback(sd_log_cb_t sd_log_cb, void* data); @@ -137,6 +137,7 @@ SD_API void sd_set_progress_callback(sd_progress_cb_t cb, void* data); SD_API void sd_set_preview_callback(sd_preview_cb_t cb, sd_preview_t mode, int interval); SD_API int32_t get_num_physical_cores(); SD_API const char* sd_get_system_info(); +SD_API void sd_request_cancel_batch(int id); typedef struct sd_ctx_t sd_ctx_t; @@ -191,7 +192,8 @@ SD_API sd_image_t* txt2img(sd_ctx_t* sd_ctx, size_t skip_layers_count, float slg_scale, float skip_layer_start, - float skip_layer_end); + float skip_layer_end, + bool skip_image_gen); SD_API sd_image_t* img2img(sd_ctx_t* sd_ctx, sd_image_t init_image, diff --git a/util.cpp b/util.cpp index 6b29aecb9..27b29f3eb 100644 --- a/util.cpp +++ b/util.cpp @@ -344,7 +344,7 @@ sd_image_t* preprocess_id_image(sd_image_t* img) { return resized; } -void pretty_progress(int step, int steps, float time) { +void pretty_progress(int step, int steps, float time, int batch) { if (sd_progress_cb) { sd_progress_cb(step, steps, time, sd_progress_cb_data); return; @@ -365,8 +365,8 @@ void pretty_progress(int step, int steps, 
float time) { } } progress += "|"; - printf(time > 1.0f ? "\r%s %i/%i - %.2fs/it" : "\r%s %i/%i - %.2fit/s\033[K", - progress.c_str(), step, steps, + printf(time > 1.0f ? "\r%s %i/%i (%i) - %.2fs/it\n" : "\r%s %i/%i (%i) - %.2fit/s\n", + progress.c_str(), step, steps, batch, time > 1.0f || time == 0 ? time : (1.0f / time)); fflush(stdout); // for linux if (step == steps) { diff --git a/util.h b/util.h index 36a2e18af..265d0e9d1 100644 --- a/util.h +++ b/util.h @@ -46,7 +46,7 @@ sd_image_f32_t clip_preprocess(sd_image_f32_t image, int size); std::string path_join(const std::string& p1, const std::string& p2); std::vector splitString(const std::string& str, char delimiter); -void pretty_progress(int step, int steps, float time); +void pretty_progress(int step, int steps, float time, int batch); void log_printf(sd_log_level_t level, const char* file, int line, const char* format, ...); From 22f4c228d6a9d661f2adddfe6baf6fdc23ce6e5f Mon Sep 17 00:00:00 2001 From: RodrigoSantiago Date: Wed, 9 Jul 2025 12:26:52 -0300 Subject: [PATCH 3/3] Custom updates --- examples/cli/main.cpp | 5 +++-- ggml_extend.hpp | 4 ++-- stable-diffusion.cpp | 48 ++++++++++++++++++++++++++++++++----------- stable-diffusion.h | 5 +++-- 4 files changed, 44 insertions(+), 18 deletions(-) diff --git a/examples/cli/main.cpp b/examples/cli/main.cpp index 381e2bef2..5877e8196 100644 --- a/examples/cli/main.cpp +++ b/examples/cli/main.cpp @@ -879,7 +879,7 @@ void sd_log_cb(enum sd_log_level_t level, const char* log, void* data) { const char* preview_path; -void step_callback(int step, int batch, sd_image_t image) { +void step_callback(int step, int steps, int batch, sd_image_t image) { stbi_write_png(preview_path, image.width, image.height, image.channel, image.data, 0); } @@ -1171,7 +1171,8 @@ int main(int argc, const char* argv[]) { params.skip_layers.size(), params.slg_scale, params.skip_layer_start, - params.skip_layer_end); + params.skip_layer_end, + false); } } else { // EDIT results = edit(sd_ctx, 
diff --git a/ggml_extend.hpp b/ggml_extend.hpp index 9cdbc4536..2724c095a 100644 --- a/ggml_extend.hpp +++ b/ggml_extend.hpp @@ -620,7 +620,7 @@ __STATIC_INLINE__ void sd_tiling(ggml_tensor* input, ggml_tensor* output, const params.mem_buffer = NULL; params.no_alloc = false; - LOG_DEBUG("tile work buffer size: %.2f MB", params.mem_size / 1024.f / 1024.f); + LOG_INFO("tile work buffer size: %.2f MB", params.mem_size / 1024.f / 1024.f); // draft context struct ggml_context* tiles_ctx = ggml_init(params); @@ -1142,7 +1142,7 @@ struct GGMLRunner { // compute the required memory size_t compute_buffer_size = ggml_gallocr_get_buffer_size(compute_allocr, 0); - LOG_DEBUG("%s compute buffer size: %.2f MB(%s)", + LOG_INFO("%s compute buffer size: %.2f MB(%s)", get_desc().c_str(), compute_buffer_size / 1024.0 / 1024.0, ggml_backend_is_cpu(backend) ? "RAM" : "VRAM"); diff --git a/stable-diffusion.cpp b/stable-diffusion.cpp index 16981ec30..d9f272218 100644 --- a/stable-diffusion.cpp +++ b/stable-diffusion.cpp @@ -488,7 +488,14 @@ class StableDiffusionGGML { first_stage_model->get_param_tensors(tensors, "first_stage_model"); } if (use_tiny_autoencoder) { - tae_first_stage = std::make_shared(backend, model_loader.tensor_storages_types, "decoder.layers", vae_decode_only, version); + ggml_backend_t tae_backend; + if (vae_on_cpu && !ggml_backend_is_cpu(backend)) { + LOG_INFO("TAE Autoencoder: Using CPU backend"); + tae_backend = ggml_backend_cpu_init(); + } else { + tae_backend = backend; + } + tae_first_stage = std::make_shared(tae_backend, model_loader.tensor_storages_types, "decoder.layers", vae_decode_only, version); } // first_stage_model->get_param_tensors(tensors, "first_stage_model."); @@ -931,12 +938,13 @@ class StableDiffusionGGML { void preview_image(ggml_context* work_ctx, int step, + int steps, int batch, struct ggml_tensor* latents, enum SDVersion version, sd_preview_t preview_mode, ggml_tensor* result, - std::function step_callback) { + std::function
step_callback) { const uint32_t channel = 3; uint32_t width = latents->ne[0]; uint32_t height = latents->ne[1]; @@ -981,7 +989,7 @@ class StableDiffusionGGML { height, channel, data}; - step_callback(step, batch, image); + step_callback(step, steps, batch, image); free(image.data); } else { if (preview_mode == SD_PREVIEW_VAE) { @@ -1005,12 +1013,12 @@ class StableDiffusionGGML { LOG_WARN("TAE not found for preview"); return; } - if (vae_tiling) { + if (vae_tiling && false) { // split latent in 64x64 tiles and compute in several steps auto on_tiling = [&](ggml_tensor* in, ggml_tensor* out, bool init) { tae_first_stage->compute(n_threads, in, true, &out); }; - silent_tiling(latents, result, 8, 32, 0.5f, on_tiling); + silent_tiling(latents, result, 8, 64, 0.5f, on_tiling); } else { tae_first_stage->compute(n_threads, latents, true, &result); } @@ -1025,7 +1033,7 @@ class StableDiffusionGGML { channel, sd_tensor_to_image(result)}; ggml_tensor_scale(result, 0); - step_callback(step, batch, image); + step_callback(step, steps, batch, image); free(image.data); } } @@ -1253,14 +1261,17 @@ class StableDiffusionGGML { auto sd_preview_mode = sd_get_preview_mode(); if (sd_preview_cb != NULL) { if (step == 1 || step >= steps || step % sd_get_preview_interval() == 0) { - preview_image(work_ctx, step, batchId, denoised, version, sd_preview_mode, preview_tensor, sd_preview_cb); - LOG_INFO("Preview Rendered"); + preview_image(work_ctx, step, (int)steps, batchId, denoised, version, sd_preview_mode, preview_tensor, sd_preview_cb); + LOG_INFO("Preview Rendered %d ", batchId); StableThreads::move_next(); } } if (!early_stop) { early_stop = StableThreads::is_request_cancel(); } + if (early_stop) { + LOG_INFO("Image cancelled %d ", batchId); + } return denoised; }; @@ -1738,7 +1749,10 @@ sd_image_t* generate_image(sd_ctx_t* sd_ctx, for (int b = 0; b < batch_count; b++) { threads.emplace_back(([&, b]() { StableThreads::register_thread(b); - if (StableThreads::is_request_cancel()) return; 
+ if (StableThreads::is_request_cancel()) { + StableThreads::remove_thread(); + return; + } struct ggml_tensor* x_0; int64_t sampling_start = ggml_time_ms(); @@ -1784,21 +1798,25 @@ sd_image_t* generate_image(sd_ctx_t* sd_ctx, LOG_INFO("sampling completed, taking %.2fs", (sampling_end - sampling_start) * 1.0f / 1000); if (!StableThreads::is_request_cancel()) { if (!skip_image_gen) { + LOG_INFO("VAE Rendering %d ", b); struct ggml_tensor* img = sd_ctx->sd->decode_first_stage(work_ctx, x_0); result_images[b].data = sd_tensor_to_image(img); int64_t decode_end = ggml_time_ms(); + LOG_INFO("Image Rendered %d ", b); LOG_INFO("latent %" PRId64 " decoded, taking %.2fs", b, (decode_end - sampling_end) * 1.0f / 1000); auto callback = sd_get_preview_callback(); - callback(-1, b, result_images[b]); + callback(-1, -1, b, result_images[b]); } } StableThreads::remove_thread(); })); + std::this_thread::sleep_for(std::chrono::milliseconds(1)); } while (StableThreads::count() < batch_count) { std::this_thread::sleep_for(std::chrono::milliseconds(1)); } + LOG_INFO("Start generate"); StableThreads::start(); for (auto& t : threads) { t.join(); @@ -1967,7 +1985,8 @@ sd_image_t* img2img(sd_ctx_t* sd_ctx, size_t skip_layers_count = 0, float slg_scale = 0, float skip_layer_start = 0.01, - float skip_layer_end = 0.2) { + float skip_layer_end = 0.2, + bool skip_image_gen = false) { std::vector skip_layers_vec(skip_layers, skip_layers + skip_layers_count); LOG_DEBUG("img2img %dx%d", width, height); if (sd_ctx == NULL) { @@ -1985,6 +2004,10 @@ sd_image_t* img2img(sd_ctx_t* sd_ctx, if (sd_ctx->sd->stacked_id) { params.mem_size += static_cast(10 * 1024 * 1024); // 10 MB } + auto sd_preview_mode = sd_get_preview_mode(); + if (sd_preview_mode != SD_PREVIEW_NONE && sd_preview_mode != SD_PREVIEW_PROJ) { + params.mem_size *= 2; + } params.mem_size += width * height * 3 * sizeof(float) * 3; params.mem_size *= batch_count; params.mem_buffer = NULL; @@ -2115,7 +2138,8 @@ sd_image_t* img2img(sd_ctx_t* 
sd_ctx, slg_scale, skip_layer_start, skip_layer_end, - masked_image); + masked_image, + skip_image_gen); size_t t2 = ggml_time_ms(); diff --git a/stable-diffusion.h b/stable-diffusion.h index 2e18f6795..a6ac4703d 100644 --- a/stable-diffusion.h +++ b/stable-diffusion.h @@ -129,7 +129,7 @@ typedef struct { typedef void (*sd_log_cb_t)(enum sd_log_level_t level, const char* text, void* data); typedef void (*sd_progress_cb_t)(int step, int steps, float time, void* data); -typedef void (*sd_preview_cb_t)(int, int, sd_image_t); +typedef void (*sd_preview_cb_t)(int, int, int, sd_image_t); SD_API void sd_set_log_callback(sd_log_cb_t sd_log_cb, void* data); @@ -220,7 +220,8 @@ SD_API sd_image_t* img2img(sd_ctx_t* sd_ctx, size_t skip_layers_count, float slg_scale, float skip_layer_start, - float skip_layer_end); + float skip_layer_end, + bool skip_image_gen); SD_API sd_image_t* img2vid(sd_ctx_t* sd_ctx, sd_image_t init_image,