diff --git a/.gitignore b/.gitignore
index 38fe570df..e1c30e700 100644
--- a/.gitignore
+++ b/.gitignore
@@ -10,4 +10,7 @@ test/
*.gguf
output*.png
models*
-*.log
\ No newline at end of file
+*.log
+preview.png
+cmake-build-debug/
+cmake-build-debug-mingw/
\ No newline at end of file
diff --git a/.idea/.gitignore b/.idea/.gitignore
new file mode 100644
index 000000000..13566b81b
--- /dev/null
+++ b/.idea/.gitignore
@@ -0,0 +1,8 @@
+# Default ignored files
+/shelf/
+/workspace.xml
+# Editor-based HTTP Client requests
+/httpRequests/
+# Datasource local storage ignored files
+/dataSources/
+/dataSources.local.xml
diff --git a/.idea/codeStyles/Project.xml b/.idea/codeStyles/Project.xml
new file mode 100644
index 000000000..f60388162
--- /dev/null
+++ b/.idea/codeStyles/Project.xml
@@ -0,0 +1,7 @@
+
+
+
+
+
+
+
\ No newline at end of file
diff --git a/.idea/codeStyles/codeStyleConfig.xml b/.idea/codeStyles/codeStyleConfig.xml
new file mode 100644
index 000000000..79ee123c2
--- /dev/null
+++ b/.idea/codeStyles/codeStyleConfig.xml
@@ -0,0 +1,5 @@
+
+
+
+
+
\ No newline at end of file
diff --git a/.idea/misc.xml b/.idea/misc.xml
new file mode 100644
index 000000000..79b3c9483
--- /dev/null
+++ b/.idea/misc.xml
@@ -0,0 +1,4 @@
+
+
+
+
\ No newline at end of file
diff --git a/.idea/modules.xml b/.idea/modules.xml
new file mode 100644
index 000000000..c930abe40
--- /dev/null
+++ b/.idea/modules.xml
@@ -0,0 +1,8 @@
+
+
+
+
+
+
+
+
\ No newline at end of file
diff --git a/.idea/stable-diffusion.cpp.iml b/.idea/stable-diffusion.cpp.iml
new file mode 100644
index 000000000..f08604bb6
--- /dev/null
+++ b/.idea/stable-diffusion.cpp.iml
@@ -0,0 +1,2 @@
+
+
\ No newline at end of file
diff --git a/.idea/vcs.xml b/.idea/vcs.xml
new file mode 100644
index 000000000..2b99a06e0
--- /dev/null
+++ b/.idea/vcs.xml
@@ -0,0 +1,7 @@
+
+
+
+
+
+
+
\ No newline at end of file
diff --git a/README.md b/README.md
index 4720dc29c..8ac08a051 100644
--- a/README.md
+++ b/README.md
@@ -50,7 +50,7 @@ Inference of Stable Diffusion and Flux in pure C/C++
- Linux
- Mac OS
- Windows
- - Android (via Termux, [Local Diffusion](https://github.com/rmatif/Local-Diffusion))
+ - Android (via Termux)
### TODO
@@ -392,12 +392,10 @@ Using formats of different precisions will yield results of varying quality.
These projects wrap `stable-diffusion.cpp` for easier use in other languages/frameworks.
-* Golang (non-cgo): [seasonjs/stable-diffusion](https://github.com/seasonjs/stable-diffusion)
-* Golang (cgo): [Binozo/GoStableDiffusion](https://github.com/Binozo/GoStableDiffusion)
+* Golang: [seasonjs/stable-diffusion](https://github.com/seasonjs/stable-diffusion)
* C#: [DarthAffe/StableDiffusion.NET](https://github.com/DarthAffe/StableDiffusion.NET)
* Python: [william-murray1204/stable-diffusion-cpp-python](https://github.com/william-murray1204/stable-diffusion-cpp-python)
* Rust: [newfla/diffusion-rs](https://github.com/newfla/diffusion-rs)
-* Flutter/Dart: [rmatif/Local-Diffusion](https://github.com/rmatif/Local-Diffusion)
## UIs
@@ -406,7 +404,6 @@ These projects use `stable-diffusion.cpp` as a backend for their image generatio
- [Jellybox](https://jellybox.com)
- [Stable Diffusion GUI](https://github.com/fszontagh/sd.cpp.gui.wx)
- [Stable Diffusion CLI-GUI](https://github.com/piallai/stable-diffusion.cpp)
-- [Local Diffusion](https://github.com/rmatif/Local-Diffusion)
## Contributors
diff --git a/denoiser.hpp b/denoiser.hpp
index 2bd0b939a..b5359092d 100644
--- a/denoiser.hpp
+++ b/denoiser.hpp
@@ -489,7 +489,7 @@ struct FluxFlowDenoiser : public Denoiser {
}
};
-typedef std::function<ggml_tensor*(ggml_tensor*, float, int)> denoise_cb_t;
+typedef std::function<ggml_tensor*(ggml_tensor*, float, int, bool&)> denoise_cb_t;
// k diffusion reverse ODE: dx = (x - D(x;\sigma)) / \sigma dt; \sigma(t) = t
static void sample_k_diffusion(sample_method_t method,
@@ -500,6 +500,8 @@ static void sample_k_diffusion(sample_method_t method,
std::shared_ptr<RNG> rng,
float eta) {
size_t steps = sigmas.size() - 1;
+ bool early_stop = false;
+
// sample_euler_ancestral
switch (method) {
case EULER_A: {
@@ -510,7 +512,8 @@ static void sample_k_diffusion(sample_method_t method,
float sigma = sigmas[i];
// denoise
- ggml_tensor* denoised = model(x, sigma, i + 1);
+ ggml_tensor* denoised = model(x, sigma, i + 1, early_stop);
+ if (early_stop) break;
// d = (x - denoised) / sigma
{
@@ -563,7 +566,8 @@ static void sample_k_diffusion(sample_method_t method,
float sigma = sigmas[i];
// denoise
- ggml_tensor* denoised = model(x, sigma, i + 1);
+ ggml_tensor* denoised = model(x, sigma, i + 1, early_stop);
+ if (early_stop) break;
// d = (x - denoised) / sigma
{
@@ -594,7 +598,8 @@ static void sample_k_diffusion(sample_method_t method,
for (int i = 0; i < steps; i++) {
// denoise
- ggml_tensor* denoised = model(x, sigmas[i], -(i + 1));
+ ggml_tensor* denoised = model(x, sigmas[i], -(i + 1), early_stop);
+ if (early_stop) break;
// d = (x - denoised) / sigma
{
@@ -628,7 +633,9 @@ static void sample_k_diffusion(sample_method_t method,
vec_x2[j] = vec_x[j] + vec_d[j] * dt;
}
- ggml_tensor* denoised = model(x2, sigmas[i + 1], i + 1);
+ ggml_tensor* denoised = model(x2, sigmas[i + 1], i + 1, early_stop);
+ if (early_stop) break;
+
float* vec_denoised = (float*)denoised->data;
for (int j = 0; j < ggml_nelements(x); j++) {
float d2 = (vec_x2[j] - vec_denoised[j]) / sigmas[i + 1];
@@ -644,7 +651,8 @@ static void sample_k_diffusion(sample_method_t method,
for (int i = 0; i < steps; i++) {
// denoise
- ggml_tensor* denoised = model(x, sigmas[i], i + 1);
+ ggml_tensor* denoised = model(x, sigmas[i], i + 1, early_stop);
+ if (early_stop) break;
// d = (x - denoised) / sigma
{
@@ -680,7 +688,9 @@ static void sample_k_diffusion(sample_method_t method,
vec_x2[j] = vec_x[j] + vec_d[j] * dt_1;
}
- ggml_tensor* denoised = model(x2, sigma_mid, i + 1);
+ ggml_tensor* denoised = model(x2, sigma_mid, i + 1, early_stop);
+ if (early_stop) break;
+
float* vec_denoised = (float*)denoised->data;
for (int j = 0; j < ggml_nelements(x); j++) {
float d2 = (vec_x2[j] - vec_denoised[j]) / sigma_mid;
@@ -697,7 +707,8 @@ static void sample_k_diffusion(sample_method_t method,
for (int i = 0; i < steps; i++) {
// denoise
- ggml_tensor* denoised = model(x, sigmas[i], i + 1);
+ ggml_tensor* denoised = model(x, sigmas[i], i + 1, early_stop);
+ if (early_stop) break;
// get_ancestral_step
float sigma_up = std::min(sigmas[i + 1],
@@ -741,7 +752,8 @@ static void sample_k_diffusion(sample_method_t method,
vec_x2[j] = (sigma_fn(s) / sigma_fn(t)) * vec_x[j] - (exp(-h * 0.5f) - 1) * vec_denoised[j];
}
- ggml_tensor* denoised = model(x2, sigmas[i + 1], i + 1);
+ ggml_tensor* denoised = model(x2, sigmas[i + 1], i + 1, early_stop);
+ if (early_stop) break;
// Second half-step
for (int j = 0; j < ggml_nelements(x); j++) {
@@ -771,7 +783,8 @@ static void sample_k_diffusion(sample_method_t method,
for (int i = 0; i < steps; i++) {
// denoise
- ggml_tensor* denoised = model(x, sigmas[i], i + 1);
+ ggml_tensor* denoised = model(x, sigmas[i], i + 1, early_stop);
+ if (early_stop) break;
float t = t_fn(sigmas[i]);
float t_next = t_fn(sigmas[i + 1]);
@@ -810,7 +823,8 @@ static void sample_k_diffusion(sample_method_t method,
for (int i = 0; i < steps; i++) {
// denoise
- ggml_tensor* denoised = model(x, sigmas[i], i + 1);
+ ggml_tensor* denoised = model(x, sigmas[i], i + 1, early_stop);
+ if (early_stop) break;
float t = t_fn(sigmas[i]);
float t_next = t_fn(sigmas[i + 1]);
@@ -860,7 +874,9 @@ static void sample_k_diffusion(sample_method_t method,
float* vec_x_next = (float*)x_next->data;
// Denoising step
- ggml_tensor* denoised = model(x_cur, sigma, i + 1);
+ ggml_tensor* denoised = model(x_cur, sigma, i + 1, early_stop);
+ if (early_stop) break;
+
float* vec_denoised = (float*)denoised->data;
// d_cur = (x_cur - denoised) / sigma
struct ggml_tensor* d_cur = ggml_dup_tensor(work_ctx, x_cur);
@@ -931,7 +947,9 @@ static void sample_k_diffusion(sample_method_t method,
float t_next = sigmas[i + 1];
// Denoising step
- ggml_tensor* denoised = model(x, sigma, i + 1);
+ ggml_tensor* denoised = model(x, sigma, i + 1, early_stop);
+ if (early_stop) break;
+
float* vec_denoised = (float*)denoised->data;
struct ggml_tensor* d_cur = ggml_dup_tensor(work_ctx, x);
float* vec_d_cur = (float*)d_cur->data;
@@ -1003,7 +1021,8 @@ static void sample_k_diffusion(sample_method_t method,
float sigma = sigmas[i];
// denoise
- ggml_tensor* denoised = model(x, sigma, i + 1);
+ ggml_tensor* denoised = model(x, sigma, i + 1, early_stop);
+ if (early_stop) break;
// x = denoised
{
@@ -1129,7 +1148,8 @@ static void sample_k_diffusion(sample_method_t method,
// defined in Karras et al. (2022), p. 3, Table 1 and
// p. 8 (7), compare also p. 38 (226) therein.
struct ggml_tensor* model_output =
- model(x, sigma, i + 1);
+ model(x, sigma, i + 1, early_stop);
+ if (early_stop) break;
// Here model_output is still the k-diffusion denoiser
// output, not the U-net output F_theta(c_in(sigma) x;
// ...) in Karras et al. (2022), whereas Diffusers'
@@ -1288,7 +1308,8 @@ static void sample_k_diffusion(sample_method_t method,
}
}
struct ggml_tensor* model_output =
- model(x, sigma, i + 1);
+ model(x, sigma, i + 1, early_stop);
+ if (early_stop) break;
{
float* vec_x = (float*)x->data;
float* vec_model_output =
diff --git a/examples/cli/main.cpp b/examples/cli/main.cpp
index d06040445..5877e8196 100644
--- a/examples/cli/main.cpp
+++ b/examples/cli/main.cpp
@@ -61,6 +61,13 @@ const char* modes_str[] = {
"convert",
};
+const char* previews_str[] = {
+ "none",
+ "proj",
+ "tae",
+ "vae",
+};
+
enum SDMode {
TXT2IMG,
IMG2IMG,
@@ -136,6 +143,11 @@ struct SDParams {
bool chroma_use_dit_mask = true;
bool chroma_use_t5_mask = false;
int chroma_t5_mask_pad = 1;
+
+ sd_preview_t preview_method = SD_PREVIEW_NONE;
+ int preview_interval = 1;
+ std::string preview_path = "preview.png";
+ bool taesd_preview = false;
};
void print_params(SDParams params) {
@@ -192,6 +204,8 @@ void print_params(SDParams params) {
printf(" chroma_use_dit_mask: %s\n", params.chroma_use_dit_mask ? "true" : "false");
printf(" chroma_use_t5_mask: %s\n", params.chroma_use_t5_mask ? "true" : "false");
printf(" chroma_t5_mask_pad: %d\n", params.chroma_t5_mask_pad);
+ printf(" preview_mode: %s\n", previews_str[params.preview_method]);
+ printf(" preview_interval: %d\n", params.preview_interval);
}
void print_usage(int argc, const char* argv[]) {
@@ -208,7 +222,8 @@ void print_usage(int argc, const char* argv[]) {
printf(" --clip_g path to the clip-g text encoder\n");
printf(" --t5xxl path to the the t5xxl text encoder\n");
printf(" --vae [VAE] path to vae\n");
- printf(" --taesd [TAESD_PATH] path to taesd. Using Tiny AutoEncoder for fast decoding (low quality)\n");
+ printf(" --taesd [TAESD] path to taesd. Using Tiny AutoEncoder for fast decoding (low quality)\n");
+ printf(" --taesd-preview-only prevents usage of taesd for decoding the final image. (for use with --preview %s)\n", previews_str[SD_PREVIEW_TAE]);
printf(" --control-net [CONTROL_PATH] path to control net model\n");
printf(" --embd-dir [EMBEDDING_PATH] path to embeddings\n");
printf(" --stacked-id-embd-dir [DIR] path to PHOTOMAKER stacked id embeddings\n");
@@ -262,6 +277,10 @@ void print_usage(int argc, const char* argv[]) {
printf(" --chroma-disable-dit-mask disable dit mask for chroma\n");
printf(" --chroma-enable-t5-mask enable t5 mask for chroma\n");
printf(" --chroma-t5-mask-pad PAD_SIZE t5 mask pad size of chroma\n");
+ printf(" --preview {%s,%s,%s,%s} preview method. (default is %s(disabled))\n", previews_str[0], previews_str[1], previews_str[2], previews_str[3], previews_str[SD_PREVIEW_NONE]);
+ printf(" %s is the fastest\n", previews_str[SD_PREVIEW_PROJ]);
+ printf(" --preview-interval [N] How often to save the image preview\n");
+ printf(" --preview-path [PATH] path to write preview image to (default: ./preview.png)\n");
printf(" -v, --verbose print extra info\n");
}
@@ -525,6 +544,8 @@ void parse_args(int argc, const char** argv, SDParams& params) {
params.diffusion_flash_attn = true; // can reduce MEM significantly
} else if (arg == "--canny") {
params.canny_preprocess = true;
+ } else if (arg == "--taesd-preview-only") {
+ params.taesd_preview = true;
} else if (arg == "-b" || arg == "--batch-count") {
if (++i >= argc) {
invalid_arg = true;
@@ -663,7 +684,36 @@ void parse_args(int argc, const char** argv, SDParams& params) {
break;
}
params.chroma_t5_mask_pad = std::stoi(argv[i]);
- } else {
+ } else if (arg == "--preview") {
+ if (++i >= argc) {
+ invalid_arg = true;
+ break;
+ }
+ const char* preview = argv[i];
+ int preview_method = -1;
+ for (int m = 0; m < N_PREVIEWS; m++) {
+ if (!strcmp(preview, previews_str[m])) {
+ preview_method = m;
+ }
+ }
+ if (preview_method == -1) {
+ invalid_arg = true;
+ break;
+ }
+ params.preview_method = (sd_preview_t)preview_method;
+ } else if (arg == "--preview-interval") {
+ if (++i >= argc) {
+ invalid_arg = true;
+ break;
+ }
+ params.preview_interval = std::stoi(argv[i]);
+ } else if (arg == "--preview-path") {
+ if (++i >= argc) {
+ invalid_arg = true;
+ break;
+ }
+ params.preview_path = argv[i];
+ } else {
fprintf(stderr, "error: unknown argument: %s\n", arg.c_str());
print_usage(argc, argv);
exit(1);
@@ -827,12 +877,20 @@ void sd_log_cb(enum sd_log_level_t level, const char* log, void* data) {
fflush(out_stream);
}
+const char* preview_path;
+
+void step_callback(int step, int steps, int batch, sd_image_t image) {
+ stbi_write_png(preview_path, image.width, image.height, image.channel, image.data, 0);
+}
+
int main(int argc, const char* argv[]) {
SDParams params;
parse_args(argc, argv, params);
+ preview_path = params.preview_path.c_str();
sd_set_log_callback(sd_log_cb, (void*)¶ms);
+ sd_set_preview_callback((sd_preview_cb_t)step_callback, params.preview_method, params.preview_interval);
if (params.verbose) {
print_params(params);
@@ -975,7 +1033,8 @@ int main(int argc, const char* argv[]) {
params.diffusion_flash_attn,
params.chroma_use_dit_mask,
params.chroma_use_t5_mask,
- params.chroma_t5_mask_pad);
+ params.chroma_t5_mask_pad,
+ params.taesd_preview);
if (sd_ctx == NULL) {
printf("new_sd_ctx_t failed\n");
@@ -1042,7 +1101,8 @@ int main(int argc, const char* argv[]) {
params.skip_layers.size(),
params.slg_scale,
params.skip_layer_start,
- params.skip_layer_end);
+ params.skip_layer_end,
+ false);
} else if (params.mode == IMG2IMG || params.mode == IMG2VID) {
sd_image_t input_image = {(uint32_t)params.width,
(uint32_t)params.height,
@@ -1111,7 +1171,8 @@ int main(int argc, const char* argv[]) {
params.skip_layers.size(),
params.slg_scale,
params.skip_layer_start,
- params.skip_layer_end);
+ params.skip_layer_end,
+ false);
}
} else { // EDIT
results = edit(sd_ctx,
diff --git a/ggml_extend.hpp b/ggml_extend.hpp
index 9f6a4fef6..2724c095a 100644
--- a/ggml_extend.hpp
+++ b/ggml_extend.hpp
@@ -620,7 +620,7 @@ __STATIC_INLINE__ void sd_tiling(ggml_tensor* input, ggml_tensor* output, const
params.mem_buffer = NULL;
params.no_alloc = false;
- LOG_DEBUG("tile work buffer size: %.2f MB", params.mem_size / 1024.f / 1024.f);
+ LOG_INFO("tile work buffer size: %.2f MB", params.mem_size / 1024.f / 1024.f);
// draft context
struct ggml_context* tiles_ctx = ggml_init(params);
@@ -634,8 +634,8 @@ __STATIC_INLINE__ void sd_tiling(ggml_tensor* input, ggml_tensor* output, const
ggml_tensor* output_tile = ggml_new_tensor_4d(tiles_ctx, GGML_TYPE_F32, tile_size * scale, tile_size * scale, output->ne[2], 1);
on_processing(input_tile, NULL, true);
int num_tiles = ceil((float)input_width / non_tile_overlap) * ceil((float)input_height / non_tile_overlap);
- LOG_INFO("processing %i tiles", num_tiles);
- pretty_progress(1, num_tiles, 0.0f);
+ LOG_DEBUG("processing %i tiles", num_tiles);
+ pretty_progress(1, num_tiles, 0.0f, 0);
int tile_count = 1;
bool last_y = false, last_x = false;
float last_time = 0.0f;
@@ -655,13 +655,13 @@ __STATIC_INLINE__ void sd_tiling(ggml_tensor* input, ggml_tensor* output, const
ggml_merge_tensor_2d(output_tile, output, x * scale, y * scale, tile_overlap * scale);
int64_t t2 = ggml_time_ms();
last_time = (t2 - t1) / 1000.0f;
- pretty_progress(tile_count, num_tiles, last_time);
+ pretty_progress(tile_count, num_tiles, last_time, 0);
tile_count++;
}
last_x = false;
}
if (tile_count < num_tiles) {
- pretty_progress(num_tiles, num_tiles, last_time);
+ pretty_progress(num_tiles, num_tiles, last_time, 0);
}
ggml_free(tiles_ctx);
}
@@ -1142,7 +1142,7 @@ struct GGMLRunner {
// compute the required memory
size_t compute_buffer_size = ggml_gallocr_get_buffer_size(compute_allocr, 0);
- LOG_DEBUG("%s compute buffer size: %.2f MB(%s)",
+ LOG_INFO("%s compute buffer size: %.2f MB(%s)",
get_desc().c_str(),
compute_buffer_size / 1024.0 / 1024.0,
ggml_backend_is_cpu(backend) ? "RAM" : "VRAM");
diff --git a/latent-preview.h b/latent-preview.h
new file mode 100644
index 000000000..7dee66f6d
--- /dev/null
+++ b/latent-preview.h
@@ -0,0 +1,83 @@
+
+// https://github.com/comfyanonymous/ComfyUI/blob/master/comfy/latent_formats.py#L152-L169
+const float flux_latent_rgb_proj[16][3] = {
+ {-0.0346f, 0.0244f, 0.0681f},
+ {0.0034f, 0.0210f, 0.0687f},
+ {0.0275f, -0.0668f, -0.0433f},
+ {-0.0174f, 0.0160f, 0.0617f},
+ {0.0859f, 0.0721f, 0.0329f},
+ {0.0004f, 0.0383f, 0.0115f},
+ {0.0405f, 0.0861f, 0.0915f},
+ {-0.0236f, -0.0185f, -0.0259f},
+ {-0.0245f, 0.0250f, 0.1180f},
+ {0.1008f, 0.0755f, -0.0421f},
+ {-0.0515f, 0.0201f, 0.0011f},
+ {0.0428f, -0.0012f, -0.0036f},
+ {0.0817f, 0.0765f, 0.0749f},
+ {-0.1264f, -0.0522f, -0.1103f},
+ {-0.0280f, -0.0881f, -0.0499f},
+ {-0.1262f, -0.0982f, -0.0778f}};
+
+// https://github.com/Stability-AI/sd3.5/blob/main/sd3_impls.py#L228-L246
+const float sd3_latent_rgb_proj[16][3] = {
+ {-0.0645f, 0.0177f, 0.1052f},
+ {0.0028f, 0.0312f, 0.0650f},
+ {0.1848f, 0.0762f, 0.0360f},
+ {0.0944f, 0.0360f, 0.0889f},
+ {0.0897f, 0.0506f, -0.0364f},
+ {-0.0020f, 0.1203f, 0.0284f},
+ {0.0855f, 0.0118f, 0.0283f},
+ {-0.0539f, 0.0658f, 0.1047f},
+ {-0.0057f, 0.0116f, 0.0700f},
+ {-0.0412f, 0.0281f, -0.0039f},
+ {0.1106f, 0.1171f, 0.1220f},
+ {-0.0248f, 0.0682f, -0.0481f},
+ {0.0815f, 0.0846f, 0.1207f},
+ {-0.0120f, -0.0055f, -0.0867f},
+ {-0.0749f, -0.0634f, -0.0456f},
+ {-0.1418f, -0.1457f, -0.1259f},
+};
+
+// https://github.com/comfyanonymous/ComfyUI/blob/master/comfy/latent_formats.py#L32-L38
+const float sdxl_latent_rgb_proj[4][3] = {
+ {0.3651f, 0.4232f, 0.4341f},
+ {-0.2533f, -0.0042f, 0.1068f},
+ {0.1076f, 0.1111f, -0.0362f},
+ {-0.3165f, -0.2492f, -0.2188f}};
+
+// https://github.com/comfyanonymous/ComfyUI/blob/master/comfy/latent_formats.py#L32-L38
+const float sd_latent_rgb_proj[4][3]{
+ {0.3512f, 0.2297f, 0.3227f},
+ {0.3250f, 0.4974f, 0.2350f},
+ {-0.2829f, 0.1762f, 0.2721f},
+ {-0.2120f, -0.2616f, -0.7177f}};
+
+void preview_latent_image(uint8_t* buffer, struct ggml_tensor* latents, const float (*latent_rgb_proj)[3], int width, int height, int dim) {
+ size_t buffer_head = 0;
+ for (int j = 0; j < height; j++) {
+ for (int i = 0; i < width; i++) {
+ size_t latent_id = (i * latents->nb[0] + j * latents->nb[1]);
+ float r = 0, g = 0, b = 0;
+ for (int d = 0; d < dim; d++) {
+ float value = *(float*)((char*)latents->data + latent_id + d * latents->nb[2]);
+ r += value * latent_rgb_proj[d][0];
+ g += value * latent_rgb_proj[d][1];
+ b += value * latent_rgb_proj[d][2];
+ }
+
+ // change range
+ r = r * .5f + .5f;
+ g = g * .5f + .5f;
+ b = b * .5f + .5f;
+
+ // clamp rgb values to [0,1] range
+ r = r >= 0 ? r <= 1 ? r : 1 : 0;
+ g = g >= 0 ? g <= 1 ? g : 1 : 0;
+ b = b >= 0 ? b <= 1 ? b : 1 : 0;
+
+ buffer[buffer_head++] = (uint8_t)(r * 255);
+ buffer[buffer_head++] = (uint8_t)(g * 255);
+ buffer[buffer_head++] = (uint8_t)(b * 255);
+ }
+ }
+}
\ No newline at end of file
diff --git a/model.cpp b/model.cpp
index 333f5d36a..7617e0f72 100644
--- a/model.cpp
+++ b/model.cpp
@@ -1923,7 +1923,7 @@ bool ModelLoader::load_tensors(on_new_tensor_cb_t on_new_tensor_cb, ggml_backend
}
}
int64_t t2 = ggml_time_ms();
- pretty_progress(++tensor_count, processed_tensor_storages.size(), (t2 - t1) / 1000.0f);
+ pretty_progress(++tensor_count, processed_tensor_storages.size(), (t2 - t1) / 1000.0f, 0);
t1 = t2;
}
diff --git a/stable-diffusion.cpp b/stable-diffusion.cpp
index b5860cfd3..d9f272218 100644
--- a/stable-diffusion.cpp
+++ b/stable-diffusion.cpp
@@ -20,6 +20,10 @@
#define STB_IMAGE_STATIC
#include "stb_image.h"
+#include "latent-preview.h"
+#include <thread>
+#include <mutex>
+
// #define STB_IMAGE_WRITE_IMPLEMENTATION
// #define STB_IMAGE_WRITE_STATIC
// #include "stb_image_write.h"
@@ -67,6 +71,102 @@ void calculate_alphas_cumprod(float* alphas_cumprod,
}
}
+void suppress_pp(int step, int steps, float time, void* data) {
+ (void)step;
+ (void)steps;
+ (void)time;
+ (void)data;
+ return;
+}
+
+class StableThreads {
+private:
+ static inline std::mutex mtx;
+ static inline std::condition_variable cv;
+ static inline std::thread::id current_index = {};
+ static inline std::vector<std::thread::id> thread_indices;
+ static inline std::unordered_map<std::thread::id, bool> thread_cancel;
+ static inline std::unordered_map<int, std::thread::id> batch_thread;
+public:
+
+ static int count() {
+ std::unique_lock lock(mtx);
+ return thread_indices.size();
+ }
+
+ static void start() {
+ std::unique_lock lock(mtx);
+ current_index = thread_indices[0];
+ cv.notify_all();
+ }
+
+ static void clear() {
+ std::unique_lock lock(mtx);
+ thread_indices.clear();
+ thread_cancel.clear();
+ batch_thread.clear();
+ }
+
+ static void register_thread(int id) {
+ std::unique_lock lock(mtx);
+ thread_indices.push_back(std::this_thread::get_id());
+ batch_thread[id] = std::this_thread::get_id();
+
+ cv.wait(lock, []() {
+ return std::this_thread::get_id() == current_index;
+ });
+ }
+
+ static void remove_thread() {
+ std::unique_lock lock(mtx);
+
+ auto it = std::find(thread_indices.begin(), thread_indices.end(), std::this_thread::get_id());
+ long long indice;
+ if (it != thread_indices.end()) {
+ indice = std::distance(thread_indices.begin(), it);
+ thread_indices.erase(it);
+ } else {
+ indice = 0;
+ }
+ if (thread_indices.empty()) {
+ return;
+ }
+ current_index = thread_indices[indice % thread_indices.size()];
+ cv.notify_all();
+ }
+
+ static void move_next() {
+ std::unique_lock lock(mtx);
+ if (thread_indices.size() < 2)
+ return;
+
+ auto it = std::find(thread_indices.begin(), thread_indices.end(), std::this_thread::get_id());
+ long long indice;
+ if (it != thread_indices.end()) {
+ indice = std::distance(thread_indices.begin(), it);
+ } else {
+ indice = 0;
+ }
+
+ current_index = thread_indices[(indice + 1) % thread_indices.size()];
+ cv.notify_all();
+ cv.wait(lock, []() {
+ return std::this_thread::get_id() == current_index;
+ });
+ }
+
+ static bool is_request_cancel() {
+ std::unique_lock lock(mtx);
+ return thread_cancel.find(std::this_thread::get_id()) != thread_cancel.end();
+ }
+
+ static void request_cancel(int id) {
+ std::unique_lock lock(mtx);
+ if (batch_thread.find(id) != batch_thread.end()) {
+ thread_cancel[batch_thread[id]] = true;
+ }
+ }
+};
/*=============================================== StableDiffusionGGML ================================================*/
class StableDiffusionGGML {
@@ -164,7 +264,8 @@ class StableDiffusionGGML {
bool diffusion_flash_attn,
bool chroma_use_dit_mask,
bool chroma_use_t5_mask,
- int chroma_t5_mask_pad) {
+ int chroma_t5_mask_pad,
+ bool tae_preview_only) {
use_tiny_autoencoder = taesd_path.size() > 0;
#ifdef SD_USE_CUDA
LOG_DEBUG("Using CUDA backend");
@@ -375,7 +476,7 @@ class StableDiffusionGGML {
diffusion_model->alloc_params_buffer();
diffusion_model->get_param_tensors(tensors);
- if (!use_tiny_autoencoder) {
+ if (!use_tiny_autoencoder || tae_preview_only) {
if (vae_on_cpu && !ggml_backend_is_cpu(backend)) {
LOG_INFO("VAE Autoencoder: Using CPU backend");
vae_backend = ggml_backend_cpu_init();
@@ -385,8 +486,16 @@ class StableDiffusionGGML {
first_stage_model = std::make_shared<AutoEncoderKL>(vae_backend, model_loader.tensor_storages_types, "first_stage_model", vae_decode_only, false, version);
first_stage_model->alloc_params_buffer();
first_stage_model->get_param_tensors(tensors, "first_stage_model");
- } else {
- tae_first_stage = std::make_shared<TinyAutoEncoder>(backend, model_loader.tensor_storages_types, "decoder.layers", vae_decode_only, version);
+ }
+ if (use_tiny_autoencoder) {
+ ggml_backend_t tae_backend;
+ if (vae_on_cpu && !ggml_backend_is_cpu(backend)) {
+ LOG_INFO("TAE Autoencoder: Using CPU backend");
+ tae_backend = ggml_backend_cpu_init();
+ } else {
+ tae_backend = backend;
+ }
+ tae_first_stage = std::make_shared<TinyAutoEncoder>(tae_backend, model_loader.tensor_storages_types, "decoder.layers", vae_decode_only, version);
}
// first_stage_model->get_param_tensors(tensors, "first_stage_model.");
@@ -477,9 +586,10 @@ class StableDiffusionGGML {
size_t clip_params_mem_size = cond_stage_model->get_params_buffer_size();
size_t unet_params_mem_size = diffusion_model->get_params_buffer_size();
size_t vae_params_mem_size = 0;
- if (!use_tiny_autoencoder) {
+ if (!use_tiny_autoencoder || tae_preview_only) {
vae_params_mem_size = first_stage_model->get_params_buffer_size();
- } else {
+ }
+ if (use_tiny_autoencoder) {
if (!tae_first_stage->load_from_file(taesd_path)) {
return false;
}
@@ -631,6 +741,7 @@ class StableDiffusionGGML {
LOG_DEBUG("finished loaded file");
ggml_free(ctx);
+ use_tiny_autoencoder = use_tiny_autoencoder && !tae_preview_only;
return true;
}
@@ -817,7 +928,118 @@ class StableDiffusionGGML {
return {c_crossattn, y, c_concat};
}
- ggml_tensor* sample(ggml_context* work_ctx,
+ void silent_tiling(ggml_tensor* input, ggml_tensor* output, const int scale, const int tile_size, const float tile_overlap_factor, on_tile_process on_processing) {
+ sd_progress_cb_t cb = sd_get_progress_callback();
+ void* cbd = sd_get_progress_callback_data();
+ sd_set_progress_callback((sd_progress_cb_t)suppress_pp, NULL);
+ sd_tiling(input, output, scale, tile_size, tile_overlap_factor, on_processing);
+ sd_set_progress_callback(cb, cbd);
+ }
+
+ void preview_image(ggml_context* work_ctx,
+ int step,
+ int steps,
+ int batch,
+ struct ggml_tensor* latents,
+ enum SDVersion version,
+ sd_preview_t preview_mode,
+ ggml_tensor* result,
+ std::function<void(int, int, int, sd_image_t)> step_callback) {
+ const uint32_t channel = 3;
+ uint32_t width = latents->ne[0];
+ uint32_t height = latents->ne[1];
+ uint32_t dim = latents->ne[2];
+ if (preview_mode == SD_PREVIEW_PROJ) {
+ const float(*latent_rgb_proj)[channel];
+
+ if (dim == 16) {
+ // 16 channels VAE -> Flux or SD3
+
+ if (sd_version_is_sd3(version)) {
+ latent_rgb_proj = sd3_latent_rgb_proj;
+ } else if (sd_version_is_flux(version)) {
+ latent_rgb_proj = flux_latent_rgb_proj;
+ } else {
+ LOG_WARN("No latent to RGB projection known for this model");
+ // unknown model
+ return;
+ }
+
+ } else if (dim == 4) {
+ // 4 channels VAE
+ if (sd_version_is_sdxl(version)) {
+ latent_rgb_proj = sdxl_latent_rgb_proj;
+ } else if (sd_version_is_sd1(version) || sd_version_is_sd2(version)) {
+ latent_rgb_proj = sd_latent_rgb_proj;
+ } else {
+ // unknown model
+ LOG_WARN("No latent to RGB projection known for this model");
+ return;
+ }
+ } else {
+ LOG_WARN("No latent to RGB projection known for this model");
+ // unknown latent space
+ return;
+ }
+ uint8_t* data = (uint8_t*)malloc(width * height * channel * sizeof(uint8_t));
+
+ preview_latent_image(data, latents, latent_rgb_proj, width, height, dim);
+ sd_image_t image = {
+ width,
+ height,
+ channel,
+ data};
+ step_callback(step, steps, batch, image);
+ free(image.data);
+ } else {
+ if (preview_mode == SD_PREVIEW_VAE) {
+ ggml_tensor_scale(latents, 1.0f / scale_factor);
+ if (vae_tiling) {
+ // split latent in 32x32 tiles and compute in several steps
+ auto on_tiling = [&](ggml_tensor* in, ggml_tensor* out, bool init) {
+ first_stage_model->compute(n_threads, in, true, &out);
+ };
+ silent_tiling(latents, result, 8, 32, 0.5f, on_tiling);
+
+ } else {
+ first_stage_model->compute(n_threads, latents, true, &result);
+ }
+ first_stage_model->free_compute_buffer();
+ ggml_tensor_scale(latents, scale_factor);
+
+ ggml_tensor_scale_output(result);
+ } else if (preview_mode == SD_PREVIEW_TAE) {
+ if (tae_first_stage == nullptr) {
+ LOG_WARN("TAE not found for preview");
+ return;
+ }
+ if (vae_tiling && false) {
+ // split latent in 64x64 tiles and compute in several steps
+ auto on_tiling = [&](ggml_tensor* in, ggml_tensor* out, bool init) {
+ tae_first_stage->compute(n_threads, in, true, &out);
+ };
+ silent_tiling(latents, result, 8, 64, 0.5f, on_tiling);
+ } else {
+ tae_first_stage->compute(n_threads, latents, true, &result);
+ }
+ tae_first_stage->free_compute_buffer();
+ } else {
+ return;
+ }
+ ggml_tensor_clamp(result, 0.0f, 1.0f);
+ sd_image_t image = {
+ width * 8,
+ height * 8,
+ channel,
+ sd_tensor_to_image(result)};
+ ggml_tensor_scale(result, 0);
+ step_callback(step, steps, batch, image);
+ free(image.data);
+ }
+ }
+
+ ggml_tensor*
+ sample(ggml_context* work_ctx,
ggml_tensor* init_latent,
ggml_tensor* noise,
SDCondition cond,
@@ -837,7 +1059,8 @@ class StableDiffusionGGML {
float slg_scale = 0,
float skip_layer_start = 0.01,
float skip_layer_end = 0.2,
- ggml_tensor* noise_mask = nullptr) {
+ ggml_tensor* noise_mask = nullptr,
+ int batchId = 0) {
LOG_DEBUG("Sample");
struct ggml_init_params params;
size_t data_size = ggml_row_size(init_latent->type, init_latent->ne[0]);
@@ -880,9 +1103,19 @@ class StableDiffusionGGML {
}
struct ggml_tensor* denoised = ggml_dup_tensor(work_ctx, x);
- auto denoise = [&](ggml_tensor* input, float sigma, int step) -> ggml_tensor* {
+ struct ggml_tensor* preview_tensor = NULL;
+ auto sd_preview_mode = sd_get_preview_mode();
+ if (sd_preview_mode != SD_PREVIEW_NONE && sd_preview_mode != SD_PREVIEW_PROJ) {
+ preview_tensor = ggml_new_tensor_4d(work_ctx, GGML_TYPE_F32,
+ (denoised->ne[0] * 8),
+ (denoised->ne[1] * 8),
+ 3,
+ denoised->ne[3]);
+ }
+
+ auto denoise = [&](ggml_tensor* input, float sigma, int step, bool& early_stop) -> ggml_tensor* {
if (step == 1) {
- pretty_progress(0, (int)steps, 0);
+ pretty_progress(0, (int)steps, 0, 0);
}
int64_t t0 = ggml_time_us();
@@ -1008,10 +1241,6 @@ class StableDiffusionGGML {
vec_denoised[i] = latent_result * c_out + vec_input[i] * c_skip;
}
int64_t t1 = ggml_time_us();
- if (step > 0) {
- pretty_progress(step, (int)steps, (t1 - t0) / 1000000.f);
- // LOG_INFO("step %d sampling completed taking %.2fs", step, (t1 - t0) * 1.0f / 1000000);
- }
if (noise_mask != nullptr) {
for (int64_t x = 0; x < denoised->ne[0]; x++) {
for (int64_t y = 0; y < denoised->ne[1]; y++) {
@@ -1024,7 +1253,25 @@ class StableDiffusionGGML {
}
}
}
-
+ if (step > 0) {
+ pretty_progress(step, (int)steps, (t1 - t0) / 1000000.f, batchId);
+ // LOG_INFO("step %d sampling completed taking %.2fs", step, (t1 - t0) * 1.0f / 1000000);
+ }
+ auto sd_preview_cb = sd_get_preview_callback();
+ auto sd_preview_mode = sd_get_preview_mode();
+ if (sd_preview_cb != NULL) {
+ if (step == 1 || step >= steps || step % sd_get_preview_interval() == 0) {
+ preview_image(work_ctx, step, (int)steps, batchId, denoised, version, sd_preview_mode, preview_tensor, sd_preview_cb);
+ LOG_INFO("Preview Rendered %d ", batchId);
+ StableThreads::move_next();
+ }
+ }
+ if (!early_stop) {
+ early_stop = StableThreads::is_request_cancel();
+ }
+ if (early_stop) {
+ LOG_INFO("Image cancelled %d ", batchId);
+ }
return denoised;
};
@@ -1170,7 +1417,8 @@ sd_ctx_t* new_sd_ctx(const char* model_path_c_str,
bool diffusion_flash_attn,
bool chroma_use_dit_mask,
bool chroma_use_t5_mask,
- int chroma_t5_mask_pad) {
+ int chroma_t5_mask_pad,
+ bool tae_preview_only) {
sd_ctx_t* sd_ctx = (sd_ctx_t*)malloc(sizeof(sd_ctx_t));
if (sd_ctx == NULL) {
return NULL;
@@ -1215,7 +1463,8 @@ sd_ctx_t* new_sd_ctx(const char* model_path_c_str,
diffusion_flash_attn,
chroma_use_dit_mask,
chroma_use_t5_mask,
- chroma_t5_mask_pad)) {
+ chroma_t5_mask_pad,
+ tae_preview_only)) {
delete sd_ctx->sd;
sd_ctx->sd = NULL;
free(sd_ctx);
@@ -1232,6 +1481,10 @@ void free_sd_ctx(sd_ctx_t* sd_ctx) {
free(sd_ctx);
}
+void sd_request_cancel_batch(int id) {
+ StableThreads::request_cancel(id);
+}
+
sd_image_t* generate_image(sd_ctx_t* sd_ctx,
struct ggml_context* work_ctx,
ggml_tensor* init_latent,
@@ -1257,7 +1510,8 @@ sd_image_t* generate_image(sd_ctx_t* sd_ctx,
float slg_scale = 0,
float skip_layer_start = 0.01,
float skip_layer_end = 0.2,
- ggml_tensor* masked_image = NULL) {
+ ggml_tensor* masked_image = NULL,
+ bool skip_image_gen = false) {
if (seed < 0) {
// Generally, when using the provided command line, the seed is always >0.
// However, to prevent potential issues if 'stable-diffusion.cpp' is invoked as a library
@@ -1434,7 +1688,6 @@ sd_image_t* generate_image(sd_ctx_t* sd_ctx,
}
// Sample
- std::vector final_latents; // collect latents to decode
int C = 4;
if (sd_version_is_sd3(sd_ctx->sd->version)) {
C = 16;
@@ -1477,89 +1730,105 @@ sd_image_t* generate_image(sd_ctx_t* sd_ctx,
} else {
noise_mask = masked_image;
}
- for (int b = 0; b < batch_count; b++) {
- int64_t sampling_start = ggml_time_ms();
- int64_t cur_seed = seed + b;
- LOG_INFO("generating image: %i/%i - seed %" PRId64, b + 1, batch_count, cur_seed);
-
- sd_ctx->sd->rng->manual_seed(cur_seed);
- struct ggml_tensor* x_t = init_latent;
- struct ggml_tensor* noise = ggml_new_tensor_4d(work_ctx, GGML_TYPE_F32, W, H, C, 1);
- ggml_tensor_set_f32_randn(noise, sd_ctx->sd->rng);
-
- int start_merge_step = -1;
- if (sd_ctx->sd->stacked_id) {
- start_merge_step = int(sd_ctx->sd->pmid_model->style_strength / 100.f * sample_steps);
- // if (start_merge_step > 30)
- // start_merge_step = 30;
- LOG_INFO("PHOTOMAKER: start_merge_step: %d", start_merge_step);
- }
-
- struct ggml_tensor* x_0 = sd_ctx->sd->sample(work_ctx,
- x_t,
- noise,
- cond,
- uncond,
- image_hint,
- control_strength,
- cfg_scale,
- cfg_scale,
- guidance,
- eta,
- sample_method,
- sigmas,
- start_merge_step,
- id_cond,
- ref_latents,
- skip_layers,
- slg_scale,
- skip_layer_start,
- skip_layer_end,
- noise_mask);
-
- // struct ggml_tensor* x_0 = load_tensor_from_file(ctx, "samples_ddim.bin");
- // print_ggml_tensor(x_0);
- int64_t sampling_end = ggml_time_ms();
- LOG_INFO("sampling completed, taking %.2fs", (sampling_end - sampling_start) * 1.0f / 1000);
- final_latents.push_back(x_0);
- }
-
- if (sd_ctx->sd->free_params_immediately) {
- sd_ctx->sd->diffusion_model->free_params_buffer();
- }
- int64_t t3 = ggml_time_ms();
- LOG_INFO("generating %" PRId64 " latent images completed, taking %.2fs", final_latents.size(), (t3 - t1) * 1.0f / 1000);
-
- // Decode to image
- LOG_INFO("decoding %zu latents", final_latents.size());
- std::vector decoded_images; // collect decoded images
- for (size_t i = 0; i < final_latents.size(); i++) {
- t1 = ggml_time_ms();
- struct ggml_tensor* img = sd_ctx->sd->decode_first_stage(work_ctx, final_latents[i] /* x_0 */);
- // print_ggml_tensor(img);
- if (img != NULL) {
- decoded_images.push_back(img);
- }
- int64_t t2 = ggml_time_ms();
- LOG_INFO("latent %" PRId64 " decoded, taking %.2fs", i + 1, (t2 - t1) * 1.0f / 1000);
- }
- int64_t t4 = ggml_time_ms();
- LOG_INFO("decode_first_stage completed, taking %.2fs", (t4 - t3) * 1.0f / 1000);
- if (sd_ctx->sd->free_params_immediately && !sd_ctx->sd->use_tiny_autoencoder) {
- sd_ctx->sd->first_stage_model->free_params_buffer();
- }
sd_image_t* result_images = (sd_image_t*)calloc(batch_count, sizeof(sd_image_t));
if (result_images == NULL) {
ggml_free(work_ctx);
return NULL;
}
- for (size_t i = 0; i < decoded_images.size(); i++) {
+ for (int i = 0; i < batch_count; i++) {
result_images[i].width = width;
result_images[i].height = height;
result_images[i].channel = 3;
- result_images[i].data = sd_tensor_to_image(decoded_images[i]);
+ result_images[i].data = nullptr;
+ }
+
+ StableThreads::clear();
+ std::vector<std::thread> threads;
+ for (int b = 0; b < batch_count; b++) {
+ threads.emplace_back(([&, b]() {
+ StableThreads::register_thread(b);
+ if (StableThreads::is_request_cancel()) {
+ StableThreads::remove_thread();
+ return;
+ }
+
+ struct ggml_tensor* x_0;
+ int64_t sampling_start = ggml_time_ms();
+ int64_t cur_seed = seed + b;
+ LOG_INFO("generating image: %i/%i - seed %" PRId64, b + 1, batch_count, cur_seed);
+
+ sd_ctx->sd->rng->manual_seed(cur_seed);
+ struct ggml_tensor* x_t = init_latent;
+ struct ggml_tensor* noise = ggml_new_tensor_4d(work_ctx, GGML_TYPE_F32, W, H, C, 1);
+ ggml_tensor_set_f32_randn(noise, sd_ctx->sd->rng);
+
+ int start_merge_step = -1;
+ if (sd_ctx->sd->stacked_id) {
+ start_merge_step = int(sd_ctx->sd->pmid_model->style_strength / 100.f * sample_steps);
+ // if (start_merge_step > 30)
+ // start_merge_step = 30;
+ LOG_INFO("PHOTOMAKER: start_merge_step: %d", start_merge_step);
+ }
+
+ x_0 = sd_ctx->sd->sample(work_ctx,
+ x_t,
+ noise,
+ cond,
+ uncond,
+ image_hint,
+ control_strength,
+ cfg_scale,
+ cfg_scale,
+ guidance,
+ eta,
+ sample_method,
+ sigmas,
+ start_merge_step,
+ id_cond,
+ ref_latents,
+ skip_layers,
+ slg_scale,
+ skip_layer_start,
+ skip_layer_end,
+ noise_mask, b);
+
+ int64_t sampling_end = ggml_time_ms();
+ LOG_INFO("sampling completed, taking %.2fs", (sampling_end - sampling_start) * 1.0f / 1000);
+ if (!StableThreads::is_request_cancel()) {
+ if (!skip_image_gen) {
+ LOG_INFO("VAE Rendering %d ", b);
+ struct ggml_tensor* img = sd_ctx->sd->decode_first_stage(work_ctx, x_0);
+ result_images[b].data = sd_tensor_to_image(img);
+ int64_t decode_end = ggml_time_ms();
+ LOG_INFO("Image Rendered %d ", b);
+ LOG_INFO("latent %" PRId64 " decoded, taking %.2fs", b, (decode_end - sampling_end) * 1.0f / 1000);
+ auto callback = sd_get_preview_callback();
+ if (callback != NULL) callback(-1, -1, b, result_images[b]);
+ }
+ }
+
+ StableThreads::remove_thread();
+ }));
+ std::this_thread::sleep_for(std::chrono::milliseconds(1));
+ }
+ while (StableThreads::count() < batch_count) {
+ std::this_thread::sleep_for(std::chrono::milliseconds(1));
+ }
+ LOG_INFO("Start generate");
+ StableThreads::start();
+ for (auto& t : threads) {
+ t.join();
+ }
+ StableThreads::clear();
+
+ if (sd_ctx->sd->free_params_immediately) {
+ sd_ctx->sd->diffusion_model->free_params_buffer();
+ }
+
+ if (sd_ctx->sd->free_params_immediately && !sd_ctx->sd->use_tiny_autoencoder) {
+ sd_ctx->sd->first_stage_model->free_params_buffer();
}
ggml_free(work_ctx);
@@ -1611,7 +1880,8 @@ sd_image_t* txt2img(sd_ctx_t* sd_ctx,
size_t skip_layers_count = 0,
float slg_scale = 0,
float skip_layer_start = 0.01,
- float skip_layer_end = 0.2) {
+ float skip_layer_end = 0.2,
+ bool skip_image_gen = false) {
std::vector skip_layers_vec(skip_layers, skip_layers + skip_layers_count);
LOG_DEBUG("txt2img %dx%d", width, height);
if (sd_ctx == NULL) {
@@ -1629,6 +1899,10 @@ sd_image_t* txt2img(sd_ctx_t* sd_ctx,
if (sd_ctx->sd->stacked_id) {
params.mem_size += static_cast(10 * 1024 * 1024); // 10 MB
}
+ auto sd_preview_mode = sd_get_preview_mode();
+ if (sd_preview_mode != SD_PREVIEW_NONE && sd_preview_mode != SD_PREVIEW_PROJ) {
+ params.mem_size *= 2;
+ }
params.mem_size += width * height * 3 * sizeof(float);
params.mem_size *= batch_count;
params.mem_buffer = NULL;
@@ -1675,7 +1949,9 @@ sd_image_t* txt2img(sd_ctx_t* sd_ctx,
skip_layers_vec,
slg_scale,
skip_layer_start,
- skip_layer_end);
+ skip_layer_end,
+ NULL,
+ skip_image_gen);
size_t t1 = ggml_time_ms();
@@ -1709,7 +1985,8 @@ sd_image_t* img2img(sd_ctx_t* sd_ctx,
size_t skip_layers_count = 0,
float slg_scale = 0,
float skip_layer_start = 0.01,
- float skip_layer_end = 0.2) {
+ float skip_layer_end = 0.2,
+ bool skip_image_gen = false) {
std::vector skip_layers_vec(skip_layers, skip_layers + skip_layers_count);
LOG_DEBUG("img2img %dx%d", width, height);
if (sd_ctx == NULL) {
@@ -1727,6 +2004,10 @@ sd_image_t* img2img(sd_ctx_t* sd_ctx,
if (sd_ctx->sd->stacked_id) {
params.mem_size += static_cast(10 * 1024 * 1024); // 10 MB
}
+ auto sd_preview_mode = sd_get_preview_mode();
+ if (sd_preview_mode != SD_PREVIEW_NONE && sd_preview_mode != SD_PREVIEW_PROJ) {
+ params.mem_size *= 2;
+ }
params.mem_size += width * height * 3 * sizeof(float) * 3;
params.mem_size *= batch_count;
params.mem_buffer = NULL;
@@ -1857,7 +2138,8 @@ sd_image_t* img2img(sd_ctx_t* sd_ctx,
slg_scale,
skip_layer_start,
skip_layer_end,
- masked_image);
+ masked_image,
+ skip_image_gen);
size_t t2 = ggml_time_ms();
@@ -1959,7 +2241,9 @@ SD_API sd_image_t* img2vid(sd_ctx_t* sd_ctx,
sample_method,
sigmas,
-1,
- SDCondition(NULL, NULL, NULL));
+ SDCondition(NULL, NULL, NULL),
+ {}, {},
+ 0, 0.01, 0.2, NULL);
int64_t t2 = ggml_time_ms();
LOG_INFO("sampling completed, taking %.2fs", (t2 - t1) * 1.0f / 1000);
diff --git a/stable-diffusion.h b/stable-diffusion.h
index b4d6fc327..a6ac4703d 100644
--- a/stable-diffusion.h
+++ b/stable-diffusion.h
@@ -112,13 +112,13 @@ enum sd_log_level_t {
SD_LOG_ERROR
};
-typedef void (*sd_log_cb_t)(enum sd_log_level_t level, const char* text, void* data);
-typedef void (*sd_progress_cb_t)(int step, int steps, float time, void* data);
-
-SD_API void sd_set_log_callback(sd_log_cb_t sd_log_cb, void* data);
-SD_API void sd_set_progress_callback(sd_progress_cb_t cb, void* data);
-SD_API int32_t get_num_physical_cores();
-SD_API const char* sd_get_system_info();
+enum sd_preview_t {
+ SD_PREVIEW_NONE,
+ SD_PREVIEW_PROJ,
+ SD_PREVIEW_TAE,
+ SD_PREVIEW_VAE,
+ N_PREVIEWS
+};
typedef struct {
uint32_t width;
@@ -127,6 +127,18 @@ typedef struct {
uint8_t* data;
} sd_image_t;
+typedef void (*sd_log_cb_t)(enum sd_log_level_t level, const char* text, void* data);
+typedef void (*sd_progress_cb_t)(int step, int steps, float time, void* data);
+typedef void (*sd_preview_cb_t)(int, int, int, sd_image_t);
+
+
+SD_API void sd_set_log_callback(sd_log_cb_t sd_log_cb, void* data);
+SD_API void sd_set_progress_callback(sd_progress_cb_t cb, void* data);
+SD_API void sd_set_preview_callback(sd_preview_cb_t cb, sd_preview_t mode, int interval);
+SD_API int32_t get_num_physical_cores();
+SD_API const char* sd_get_system_info();
+SD_API void sd_request_cancel_batch(int id);
+
typedef struct sd_ctx_t sd_ctx_t;
SD_API sd_ctx_t* new_sd_ctx(const char* model_path,
@@ -153,7 +165,8 @@ SD_API sd_ctx_t* new_sd_ctx(const char* model_path,
bool diffusion_flash_attn,
bool chroma_use_dit_mask,
bool chroma_use_t5_mask,
- int chroma_t5_mask_pad);
+ int chroma_t5_mask_pad,
+ bool tae_preview_only);
SD_API void free_sd_ctx(sd_ctx_t* sd_ctx);
@@ -179,7 +192,8 @@ SD_API sd_image_t* txt2img(sd_ctx_t* sd_ctx,
size_t skip_layers_count,
float slg_scale,
float skip_layer_start,
- float skip_layer_end);
+ float skip_layer_end,
+ bool skip_image_gen);
SD_API sd_image_t* img2img(sd_ctx_t* sd_ctx,
sd_image_t init_image,
@@ -206,7 +220,8 @@ SD_API sd_image_t* img2img(sd_ctx_t* sd_ctx,
size_t skip_layers_count,
float slg_scale,
float skip_layer_start,
- float skip_layer_end);
+ float skip_layer_end,
+ bool skip_image_gen);
SD_API sd_image_t* img2vid(sd_ctx_t* sd_ctx,
sd_image_t init_image,
diff --git a/util.cpp b/util.cpp
index 631c12066..27b29f3eb 100644
--- a/util.cpp
+++ b/util.cpp
@@ -247,6 +247,10 @@ int32_t get_num_physical_cores() {
static sd_progress_cb_t sd_progress_cb = NULL;
void* sd_progress_cb_data = NULL;
+static sd_preview_cb_t sd_preview_cb = NULL;
+sd_preview_t sd_preview_mode = SD_PREVIEW_NONE;
+int sd_preview_interval = 1;
+
std::u32string utf8_to_utf32(const std::string& utf8_str) {
std::wstring_convert, char32_t> converter;
return converter.from_bytes(utf8_str);
@@ -340,7 +344,7 @@ sd_image_t* preprocess_id_image(sd_image_t* img) {
return resized;
}
-void pretty_progress(int step, int steps, float time) {
+void pretty_progress(int step, int steps, float time, int batch) {
if (sd_progress_cb) {
sd_progress_cb(step, steps, time, sd_progress_cb_data);
return;
@@ -361,8 +365,8 @@ void pretty_progress(int step, int steps, float time) {
}
}
progress += "|";
- printf(time > 1.0f ? "\r%s %i/%i - %.2fs/it" : "\r%s %i/%i - %.2fit/s\033[K",
- progress.c_str(), step, steps,
+ printf(time > 1.0f ? "\r%s %i/%i (%i) - %.2fs/it\n" : "\r%s %i/%i (%i) - %.2fit/s\n",
+ progress.c_str(), step, steps, batch,
time > 1.0f || time == 0 ? time : (1.0f / time));
fflush(stdout); // for linux
if (step == steps) {
@@ -420,6 +424,29 @@ void sd_set_progress_callback(sd_progress_cb_t cb, void* data) {
sd_progress_cb = cb;
sd_progress_cb_data = data;
}
+void sd_set_preview_callback(sd_preview_cb_t cb, sd_preview_t mode, int interval) {
+ sd_preview_cb = cb;
+ sd_preview_mode = mode;
+ sd_preview_interval = interval;
+}
+
+sd_preview_cb_t sd_get_preview_callback() {
+ return sd_preview_cb;
+}
+
+sd_preview_t sd_get_preview_mode() {
+ return sd_preview_mode;
+}
+int sd_get_preview_interval() {
+ return sd_preview_interval;
+}
+
+sd_progress_cb_t sd_get_progress_callback() {
+ return sd_progress_cb;
+}
+void* sd_get_progress_callback_data() {
+ return sd_progress_cb_data;
+}
const char* sd_get_system_info() {
static char buffer[1024];
std::stringstream ss;
diff --git a/util.h b/util.h
index 14fa812e5..265d0e9d1 100644
--- a/util.h
+++ b/util.h
@@ -46,7 +46,7 @@ sd_image_f32_t clip_preprocess(sd_image_f32_t image, int size);
std::string path_join(const std::string& p1, const std::string& p2);
std::vector splitString(const std::string& str, char delimiter);
-void pretty_progress(int step, int steps, float time);
+void pretty_progress(int step, int steps, float time, int batch);
void log_printf(sd_log_level_t level, const char* file, int line, const char* format, ...);
@@ -54,6 +54,13 @@ std::string trim(const std::string& s);
std::vector> parse_prompt_attention(const std::string& text);
+sd_progress_cb_t sd_get_progress_callback();
+void* sd_get_progress_callback_data();
+
+sd_preview_cb_t sd_get_preview_callback();
+sd_preview_t sd_get_preview_mode();
+int sd_get_preview_interval();
+
#define LOG_DEBUG(format, ...) log_printf(SD_LOG_DEBUG, __FILE__, __LINE__, format, ##__VA_ARGS__)
#define LOG_INFO(format, ...) log_printf(SD_LOG_INFO, __FILE__, __LINE__, format, ##__VA_ARGS__)
#define LOG_WARN(format, ...) log_printf(SD_LOG_WARN, __FILE__, __LINE__, format, ##__VA_ARGS__)