Skip to content

Commit 2eac844

Browse files
committed
fix: generate image correctly in img2img mode
1 parent 968226a commit 2eac844

File tree

1 file changed

+62
-32
lines changed

1 file changed

+62
-32
lines changed

stable-diffusion.cpp

Lines changed: 62 additions & 32 deletions
Original file line numberDiff line numberDiff line change
@@ -131,7 +131,7 @@ void print_ggml_tensor(struct ggml_tensor* tensor, bool shape_only = false) {
131131
if (shape_only) {
132132
return;
133133
}
134-
int range = 1000;
134+
int range = 3;
135135
for (int i = 0; i < tensor->ne[3]; i++) {
136136
if (i >= range && i + range < tensor->ne[3]) {
137137
continue;
@@ -335,7 +335,7 @@ void sd_image_to_tensor(const uint8_t* image_data,
335335
}
336336
}
337337

338-
float sd_mean(struct ggml_tensor* src) {
338+
float ggml_tensor_mean(struct ggml_tensor* src) {
339339
float mean = 0.0f;
340340
int64_t nelements = ggml_nelements(src);
341341
float* data = (float*)src->data;
@@ -345,15 +345,26 @@ float sd_mean(struct ggml_tensor* src) {
345345
return mean;
346346
}
347347

348-
void sd_scale(struct ggml_tensor* src, float scale) {
348+
// a = a+b
349+
void ggml_tensor_add(struct ggml_tensor* a, struct ggml_tensor* b) {
350+
GGML_ASSERT(ggml_nelements(a) == ggml_nelements(b));
351+
int64_t nelements = ggml_nelements(a);
352+
float* vec_a = (float*)a->data;
353+
float* vec_b = (float*)b->data;
354+
for (int i = 0; i < nelements; i++) {
355+
vec_a[i] = vec_a[i] + vec_b[i];
356+
}
357+
}
358+
359+
void ggml_tensor_scale(struct ggml_tensor* src, float scale) {
349360
int64_t nelements = ggml_nelements(src);
350361
float* data = (float*)src->data;
351362
for (int i = 0; i < nelements; i++) {
352363
data[i] = data[i] * scale;
353364
}
354365
}
355366

356-
void sd_clamp(struct ggml_tensor* src, float min, float max) {
367+
void ggml_tensor_clamp(struct ggml_tensor* src, float min, float max) {
357368
int64_t nelements = ggml_nelements(src);
358369
float* data = (float*)src->data;
359370
for (int i = 0; i < nelements; i++) {
@@ -363,7 +374,7 @@ void sd_clamp(struct ggml_tensor* src, float min, float max) {
363374
}
364375

365376
// convert values from [0, 1] to [-1, 1]
366-
void sd_convert_input(struct ggml_tensor* src) {
377+
void ggml_tensor_scale_input(struct ggml_tensor* src) {
367378
int64_t nelements = ggml_nelements(src);
368379
float* data = (float*)src->data;
369380
for (int i = 0; i < nelements; i++) {
@@ -373,7 +384,7 @@ void sd_convert_input(struct ggml_tensor* src) {
373384
}
374385

375386
// convert values from [-1, 1] to [0, 1]
376-
void sd_convert_output(struct ggml_tensor* src) {
387+
void ggml_tensor_scale_output(struct ggml_tensor* src) {
377388
int64_t nelements = ggml_nelements(src);
378389
float* data = (float*)src->data;
379390
for (int i = 0; i < nelements; i++) {
@@ -4724,7 +4735,7 @@ class StableDiffusionGGML {
47244735
LOG_DEBUG("computing condition graph completed, taking %" PRId64 " ms", t1 - t0);
47254736
ggml_tensor* result = ggml_dup_tensor(work_ctx, hidden_states);
47264737
{
4727-
float original_mean = sd_mean(hidden_states);
4738+
float original_mean = ggml_tensor_mean(hidden_states);
47284739
for (int i2 = 0; i2 < hidden_states->ne[2]; i2++) {
47294740
for (int i1 = 0; i1 < hidden_states->ne[1]; i1++) {
47304741
for (int i0 = 0; i0 < hidden_states->ne[0]; i0++) {
@@ -4734,16 +4745,17 @@ class StableDiffusionGGML {
47344745
}
47354746
}
47364747
}
4737-
float new_mean = sd_mean(result);
4738-
sd_scale(result, (original_mean / new_mean));
4748+
float new_mean = ggml_tensor_mean(result);
4749+
ggml_tensor_scale(result, (original_mean / new_mean));
47394750
}
47404751
return result; // [1, 77, 768]
47414752
}
47424753

47434754
ggml_tensor* sample(ggml_context* work_ctx,
47444755
ggml_tensor* x_t,
4745-
ggml_tensor* positive,
4746-
ggml_tensor* negative,
4756+
ggml_tensor* noise,
4757+
ggml_tensor* c,
4758+
ggml_tensor* uc,
47474759
float cfg_scale,
47484760
SampleMethod method,
47494761
const std::vector<float>& sigmas) {
@@ -4756,12 +4768,18 @@ class StableDiffusionGGML {
47564768
struct ggml_tensor* noised_input = ggml_dup_tensor(work_ctx, x_t);
47574769
struct ggml_tensor* timesteps = ggml_new_tensor_1d(work_ctx, GGML_TYPE_F32, 1); // [N, ]
47584770
struct ggml_tensor* t_emb = new_timestep_embedding(work_ctx, NULL, timesteps, diffusion_model.model_channels); // [N, model_channels]
4759-
diffusion_model.begin(noised_input, positive, t_emb);
4771+
diffusion_model.begin(noised_input, c, t_emb);
47604772

4761-
bool has_unconditioned = cfg_scale != 1.0 && negative != NULL;
4773+
bool has_unconditioned = cfg_scale != 1.0 && uc != NULL;
47624774

4763-
// x = x * sigmas[0]
4764-
sd_scale(x, sigmas[0]);
4775+
if (noise == NULL) {
4776+
// x = x * sigmas[0]
4777+
ggml_tensor_scale(x, sigmas[0]);
4778+
} else {
4779+
// xi = x + noise * sigma_sched[0]
4780+
ggml_tensor_scale(noise, sigmas[0]);
4781+
ggml_tensor_add(x, noise);
4782+
}
47654783

47664784
// denoise wrapper
47674785
struct ggml_tensor* out_cond = ggml_dup_tensor(work_ctx, x);
@@ -4797,15 +4815,15 @@ class StableDiffusionGGML {
47974815

47984816
copy_ggml_tensor(noised_input, input);
47994817
// noised_input = noised_input * c_in
4800-
sd_scale(noised_input, c_in);
4818+
ggml_tensor_scale(noised_input, c_in);
48014819

48024820
// cond
4803-
diffusion_model.compute(out_cond, n_threads, noised_input, NULL, positive, t_emb);
4821+
diffusion_model.compute(out_cond, n_threads, noised_input, NULL, c, t_emb);
48044822

48054823
float* negative_data = NULL;
48064824
if (has_unconditioned) {
48074825
// uncond
4808-
diffusion_model.compute(out_uncond, n_threads, noised_input, NULL, negative, t_emb);
4826+
diffusion_model.compute(out_uncond, n_threads, noised_input, NULL, uc, t_emb);
48094827
negative_data = (float*)out_uncond->data;
48104828
}
48114829
float* vec_denoised = (float*)denoised->data;
@@ -5260,15 +5278,15 @@ class StableDiffusionGGML {
52605278
int64_t t0 = ggml_time_ms();
52615279
if (!use_tiny_autoencoder) {
52625280
if (decode) {
5263-
sd_scale(x, 1.0f / scale_factor);
5281+
ggml_tensor_scale(x, 1.0f / scale_factor);
52645282
} else {
5265-
sd_convert_input(x);
5283+
ggml_tensor_scale_input(x);
52665284
}
52675285
first_stage_model.begin(x, decode);
52685286
first_stage_model.compute(result, n_threads, x, decode);
52695287
first_stage_model.end();
52705288
if (decode) {
5271-
sd_convert_output(result);
5289+
ggml_tensor_scale_output(result);
52725290
}
52735291
} else {
52745292
tae_first_stage.begin(x, decode);
@@ -5278,10 +5296,18 @@ class StableDiffusionGGML {
52785296
int64_t t1 = ggml_time_ms();
52795297
LOG_DEBUG("computing vae [mode: %s] graph completed, taking %.2fs", decode ? "DECODE" : "ENCODE", (t1 - t0) * 1.0f / 1000);
52805298
if (decode) {
5281-
sd_clamp(result, 0.0f, 1.0f);
5299+
ggml_tensor_clamp(result, 0.0f, 1.0f);
52825300
}
52835301
return result;
52845302
}
5303+
5304+
// Convenience wrapper: forwards x to compute_first_stage with the decode flag
// cleared and returns whatever tensor that call produces.
ggml_tensor* encode_first_stage(ggml_context* work_ctx, ggml_tensor* x) {
    const bool decode = false;
    return compute_first_stage(work_ctx, x, decode);
}
5307+
5308+
// Convenience wrapper: forwards x to compute_first_stage with the decode flag
// set and returns whatever tensor that call produces.
ggml_tensor* decode_first_stage(ggml_context* work_ctx, ggml_tensor* x) {
    const bool decode = true;
    return compute_first_stage(work_ctx, x, decode);
}
52855311
};
52865312

52875313
/*================================================= StableDiffusion ==================================================*/
@@ -5358,11 +5384,11 @@ std::vector<uint8_t*> StableDiffusion::txt2img(std::string prompt,
53585384
seed = rand();
53595385
}
53605386

5361-
t0 = ggml_time_ms();
5362-
ggml_tensor* postive = sd->get_learned_condition(work_ctx, prompt);
5363-
struct ggml_tensor* negative = NULL;
5387+
t0 = ggml_time_ms();
5388+
ggml_tensor* c = sd->get_learned_condition(work_ctx, prompt);
5389+
struct ggml_tensor* uc = NULL;
53645390
if (cfg_scale != 1.0) {
5365-
negative = sd->get_learned_condition(work_ctx, negative_prompt);
5391+
uc = sd->get_learned_condition(work_ctx, negative_prompt);
53665392
}
53675393
t1 = ggml_time_ms();
53685394
LOG_INFO("get_learned_condition completed, taking %" PRId64 " ms", t1 - t0);
@@ -5387,7 +5413,7 @@ std::vector<uint8_t*> StableDiffusion::txt2img(std::string prompt,
53875413

53885414
std::vector<float> sigmas = sd->denoiser->schedule->get_sigmas(sample_steps);
53895415

5390-
struct ggml_tensor* x_0 = sd->sample(work_ctx, x_t, postive, negative, cfg_scale, sample_method, sigmas);
5416+
struct ggml_tensor* x_0 = sd->sample(work_ctx, x_t, NULL, c, uc, cfg_scale, sample_method, sigmas);
53915417
// struct ggml_tensor* x_0 = load_tensor_from_file(ctx, "samples_ddim.bin");
53925418
// print_ggml_tensor(x_0);
53935419
int64_t sampling_end = ggml_time_ms();
@@ -5404,7 +5430,7 @@ std::vector<uint8_t*> StableDiffusion::txt2img(std::string prompt,
54045430
LOG_INFO("decoding %zu latents", final_latents.size());
54055431
for (size_t i = 0; i < final_latents.size(); i++) {
54065432
t1 = ggml_time_ms();
5407-
struct ggml_tensor* img = sd->compute_first_stage(work_ctx, final_latents[i] /* x_0 */, true);
5433+
struct ggml_tensor* img = sd->decode_first_stage(work_ctx, final_latents[i] /* x_0 */);
54085434
if (img != NULL) {
54095435
results.push_back(sd_tensor_to_image(img));
54105436
}
@@ -5483,10 +5509,10 @@ std::vector<uint8_t*> StableDiffusion::img2img(const uint8_t* init_img_data,
54835509
t0 = ggml_time_ms();
54845510
ggml_tensor* init_latent = NULL;
54855511
if (!sd->use_tiny_autoencoder) {
5486-
ggml_tensor* moments = sd->compute_first_stage(work_ctx, init_img, false);
5512+
ggml_tensor* moments = sd->encode_first_stage(work_ctx, init_img);
54875513
init_latent = sd->get_first_stage_encoding(work_ctx, moments);
54885514
} else {
5489-
init_latent = sd->compute_first_stage(work_ctx, init_img, false);
5515+
init_latent = sd->encode_first_stage(work_ctx, init_img);
54905516
}
54915517
// print_ggml_tensor(init_latent);
54925518
t1 = ggml_time_ms();
@@ -5507,8 +5533,12 @@ std::vector<uint8_t*> StableDiffusion::img2img(const uint8_t* init_img_data,
55075533
// requires encode_adm
55085534
// apply set_timestep_embedding with dim 256
55095535

5536+
sd->rng->manual_seed(seed);
5537+
struct ggml_tensor* noise = ggml_dup_tensor(work_ctx, init_latent);
5538+
ggml_tensor_set_f32_randn(noise, sd->rng);
5539+
55105540
LOG_INFO("sampling using %s method", sampling_methods_str[sample_method]);
5511-
struct ggml_tensor* x_0 = sd->sample(work_ctx, init_latent, c, uc, cfg_scale, sample_method, sigma_sched);
5541+
struct ggml_tensor* x_0 = sd->sample(work_ctx, init_latent, noise, c, uc, cfg_scale, sample_method, sigma_sched);
55125542
// struct ggml_tensor *x_0 = load_tensor_from_file(ctx, "samples_ddim.bin");
55135543
// print_ggml_tensor(x_0);
55145544
int64_t t3 = ggml_time_ms();
@@ -5517,7 +5547,7 @@ std::vector<uint8_t*> StableDiffusion::img2img(const uint8_t* init_img_data,
55175547
sd->diffusion_model.destroy();
55185548
}
55195549

5520-
struct ggml_tensor* img = sd->compute_first_stage(work_ctx, x_0, true);
5550+
struct ggml_tensor* img = sd->decode_first_stage(work_ctx, x_0);
55215551
if (img != NULL) {
55225552
result.push_back(sd_tensor_to_image(img));
55235553
}

0 commit comments

Comments
 (0)