@@ -131,7 +131,7 @@ void print_ggml_tensor(struct ggml_tensor* tensor, bool shape_only = false) {
131131 if (shape_only) {
132132 return ;
133133 }
134- int range = 1000 ;
134+ int range = 3 ;
135135 for (int i = 0 ; i < tensor->ne [3 ]; i++) {
136136 if (i >= range && i + range < tensor->ne [3 ]) {
137137 continue ;
@@ -335,7 +335,7 @@ void sd_image_to_tensor(const uint8_t* image_data,
335335 }
336336}
337337
338- float sd_mean (struct ggml_tensor * src) {
338+ float ggml_tensor_mean (struct ggml_tensor * src) {
339339 float mean = 0 .0f ;
340340 int64_t nelements = ggml_nelements (src);
341341 float * data = (float *)src->data ;
@@ -345,15 +345,26 @@ float sd_mean(struct ggml_tensor* src) {
345345 return mean;
346346}
347347
348- void sd_scale (struct ggml_tensor * src, float scale) {
348+ // a = a+b
349+ void ggml_tensor_add (struct ggml_tensor * a, struct ggml_tensor * b) {
350+ GGML_ASSERT (ggml_nelements (a) == ggml_nelements (b));
351+ int64_t nelements = ggml_nelements (a);
352+ float * vec_a = (float *)a->data ;
353+ float * vec_b = (float *)b->data ;
354+ for (int i = 0 ; i < nelements; i++) {
355+ vec_a[i] = vec_a[i] + vec_b[i];
356+ }
357+ }
358+
359+ void ggml_tensor_scale (struct ggml_tensor * src, float scale) {
349360 int64_t nelements = ggml_nelements (src);
350361 float * data = (float *)src->data ;
351362 for (int i = 0 ; i < nelements; i++) {
352363 data[i] = data[i] * scale;
353364 }
354365}
355366
356- void sd_clamp (struct ggml_tensor * src, float min, float max) {
367+ void ggml_tensor_clamp (struct ggml_tensor * src, float min, float max) {
357368 int64_t nelements = ggml_nelements (src);
358369 float * data = (float *)src->data ;
359370 for (int i = 0 ; i < nelements; i++) {
@@ -363,7 +374,7 @@ void sd_clamp(struct ggml_tensor* src, float min, float max) {
363374}
364375
365376// convert values from [0, 1] to [-1, 1]
366- void sd_convert_input (struct ggml_tensor * src) {
377+ void ggml_tensor_scale_input (struct ggml_tensor * src) {
367378 int64_t nelements = ggml_nelements (src);
368379 float * data = (float *)src->data ;
369380 for (int i = 0 ; i < nelements; i++) {
@@ -373,7 +384,7 @@ void sd_convert_input(struct ggml_tensor* src) {
373384}
374385
375386// convert values from [-1, 1] to [0, 1]
376- void sd_convert_output (struct ggml_tensor * src) {
387+ void ggml_tensor_scale_output (struct ggml_tensor * src) {
377388 int64_t nelements = ggml_nelements (src);
378389 float * data = (float *)src->data ;
379390 for (int i = 0 ; i < nelements; i++) {
@@ -4724,7 +4735,7 @@ class StableDiffusionGGML {
47244735 LOG_DEBUG (" computing condition graph completed, taking %" PRId64 " ms" , t1 - t0);
47254736 ggml_tensor* result = ggml_dup_tensor (work_ctx, hidden_states);
47264737 {
4727- float original_mean = sd_mean (hidden_states);
4738+ float original_mean = ggml_tensor_mean (hidden_states);
47284739 for (int i2 = 0 ; i2 < hidden_states->ne [2 ]; i2++) {
47294740 for (int i1 = 0 ; i1 < hidden_states->ne [1 ]; i1++) {
47304741 for (int i0 = 0 ; i0 < hidden_states->ne [0 ]; i0++) {
@@ -4734,16 +4745,17 @@ class StableDiffusionGGML {
47344745 }
47354746 }
47364747 }
4737- float new_mean = sd_mean (result);
4738- sd_scale (result, (original_mean / new_mean));
4748+ float new_mean = ggml_tensor_mean (result);
4749+ ggml_tensor_scale (result, (original_mean / new_mean));
47394750 }
47404751 return result; // [1, 77, 768]
47414752 }
47424753
47434754 ggml_tensor* sample (ggml_context* work_ctx,
47444755 ggml_tensor* x_t ,
4745- ggml_tensor* positive,
4746- ggml_tensor* negative,
4756+ ggml_tensor* noise,
4757+ ggml_tensor* c,
4758+ ggml_tensor* uc,
47474759 float cfg_scale,
47484760 SampleMethod method,
47494761 const std::vector<float >& sigmas) {
@@ -4756,12 +4768,18 @@ class StableDiffusionGGML {
47564768 struct ggml_tensor * noised_input = ggml_dup_tensor (work_ctx, x_t );
47574769 struct ggml_tensor * timesteps = ggml_new_tensor_1d (work_ctx, GGML_TYPE_F32, 1 ); // [N, ]
47584770 struct ggml_tensor * t_emb = new_timestep_embedding (work_ctx, NULL , timesteps, diffusion_model.model_channels ); // [N, model_channels]
4759- diffusion_model.begin (noised_input, positive , t_emb);
4771+ diffusion_model.begin (noised_input, c , t_emb);
47604772
4761- bool has_unconditioned = cfg_scale != 1.0 && negative != NULL ;
4773+ bool has_unconditioned = cfg_scale != 1.0 && uc != NULL ;
47624774
4763- // x = x * sigmas[0]
4764- sd_scale (x, sigmas[0 ]);
4775+ if (noise == NULL ) {
4776+ // x = x * sigmas[0]
4777+ ggml_tensor_scale (x, sigmas[0 ]);
4778+ } else {
4779+ // xi = x + noise * sigma_sched[0]
4780+ ggml_tensor_scale (noise, sigmas[0 ]);
4781+ ggml_tensor_add (x, noise);
4782+ }
47654783
47664784 // denoise wrapper
47674785 struct ggml_tensor * out_cond = ggml_dup_tensor (work_ctx, x);
@@ -4797,15 +4815,15 @@ class StableDiffusionGGML {
47974815
47984816 copy_ggml_tensor (noised_input, input);
47994817 // noised_input = noised_input * c_in
4800- sd_scale (noised_input, c_in);
4818+ ggml_tensor_scale (noised_input, c_in);
48014819
48024820 // cond
4803- diffusion_model.compute (out_cond, n_threads, noised_input, NULL , positive , t_emb);
4821+ diffusion_model.compute (out_cond, n_threads, noised_input, NULL , c , t_emb);
48044822
48054823 float * negative_data = NULL ;
48064824 if (has_unconditioned) {
48074825 // uncond
4808- diffusion_model.compute (out_uncond, n_threads, noised_input, NULL , negative , t_emb);
4826+ diffusion_model.compute (out_uncond, n_threads, noised_input, NULL , uc , t_emb);
48094827 negative_data = (float *)out_uncond->data ;
48104828 }
48114829 float * vec_denoised = (float *)denoised->data ;
@@ -5260,15 +5278,15 @@ class StableDiffusionGGML {
52605278 int64_t t0 = ggml_time_ms ();
52615279 if (!use_tiny_autoencoder) {
52625280 if (decode) {
5263- sd_scale (x, 1 .0f / scale_factor);
5281+ ggml_tensor_scale (x, 1 .0f / scale_factor);
52645282 } else {
5265- sd_convert_input (x);
5283+ ggml_tensor_scale_input (x);
52665284 }
52675285 first_stage_model.begin (x, decode);
52685286 first_stage_model.compute (result, n_threads, x, decode);
52695287 first_stage_model.end ();
52705288 if (decode) {
5271- sd_convert_output (result);
5289+ ggml_tensor_scale_output (result);
52725290 }
52735291 } else {
52745292 tae_first_stage.begin (x, decode);
@@ -5278,10 +5296,18 @@ class StableDiffusionGGML {
52785296 int64_t t1 = ggml_time_ms ();
52795297 LOG_DEBUG (" computing vae [mode: %s] graph completed, taking %.2fs" , decode ? " DECODE" : " ENCODE" , (t1 - t0) * 1 .0f / 1000 );
52805298 if (decode) {
5281- sd_clamp (result, 0 .0f , 1 .0f );
5299+ ggml_tensor_clamp (result, 0 .0f , 1 .0f );
52825300 }
52835301 return result;
52845302 }
5303+
5304+ ggml_tensor* encode_first_stage (ggml_context* work_ctx, ggml_tensor* x) {
5305+ return compute_first_stage (work_ctx, x, false );
5306+ }
5307+
5308+ ggml_tensor* decode_first_stage (ggml_context* work_ctx, ggml_tensor* x) {
5309+ return compute_first_stage (work_ctx, x, true );
5310+ }
52855311};
52865312
52875313/* ================================================= StableDiffusion ==================================================*/
@@ -5358,11 +5384,11 @@ std::vector<uint8_t*> StableDiffusion::txt2img(std::string prompt,
53585384 seed = rand ();
53595385 }
53605386
5361- t0 = ggml_time_ms ();
5362- ggml_tensor* postive = sd->get_learned_condition (work_ctx, prompt);
5363- struct ggml_tensor * negative = NULL ;
5387+ t0 = ggml_time_ms ();
5388+ ggml_tensor* c = sd->get_learned_condition (work_ctx, prompt);
5389+ struct ggml_tensor * uc = NULL ;
53645390 if (cfg_scale != 1.0 ) {
5365- negative = sd->get_learned_condition (work_ctx, negative_prompt);
5391+ uc = sd->get_learned_condition (work_ctx, negative_prompt);
53665392 }
53675393 t1 = ggml_time_ms ();
53685394 LOG_INFO (" get_learned_condition completed, taking %" PRId64 " ms" , t1 - t0);
@@ -5387,7 +5413,7 @@ std::vector<uint8_t*> StableDiffusion::txt2img(std::string prompt,
53875413
53885414 std::vector<float > sigmas = sd->denoiser ->schedule ->get_sigmas (sample_steps);
53895415
5390- struct ggml_tensor * x_0 = sd->sample (work_ctx, x_t , postive, negative , cfg_scale, sample_method, sigmas);
5416+ struct ggml_tensor * x_0 = sd->sample (work_ctx, x_t , NULL , c, uc , cfg_scale, sample_method, sigmas);
53915417 // struct ggml_tensor* x_0 = load_tensor_from_file(ctx, "samples_ddim.bin");
53925418 // print_ggml_tensor(x_0);
53935419 int64_t sampling_end = ggml_time_ms ();
@@ -5404,7 +5430,7 @@ std::vector<uint8_t*> StableDiffusion::txt2img(std::string prompt,
54045430 LOG_INFO (" decoding %zu latents" , final_latents.size ());
54055431 for (size_t i = 0 ; i < final_latents.size (); i++) {
54065432 t1 = ggml_time_ms ();
5407- struct ggml_tensor * img = sd->compute_first_stage (work_ctx, final_latents[i] /* x_0 */ , true );
5433+ struct ggml_tensor * img = sd->decode_first_stage (work_ctx, final_latents[i] /* x_0 */ );
54085434 if (img != NULL ) {
54095435 results.push_back (sd_tensor_to_image (img));
54105436 }
@@ -5483,10 +5509,10 @@ std::vector<uint8_t*> StableDiffusion::img2img(const uint8_t* init_img_data,
54835509 t0 = ggml_time_ms ();
54845510 ggml_tensor* init_latent = NULL ;
54855511 if (!sd->use_tiny_autoencoder ) {
5486- ggml_tensor* moments = sd->compute_first_stage (work_ctx, init_img, false );
5512+ ggml_tensor* moments = sd->encode_first_stage (work_ctx, init_img);
54875513 init_latent = sd->get_first_stage_encoding (work_ctx, moments);
54885514 } else {
5489- init_latent = sd->compute_first_stage (work_ctx, init_img, false );
5515+ init_latent = sd->encode_first_stage (work_ctx, init_img);
54905516 }
54915517 // print_ggml_tensor(init_latent);
54925518 t1 = ggml_time_ms ();
@@ -5507,8 +5533,12 @@ std::vector<uint8_t*> StableDiffusion::img2img(const uint8_t* init_img_data,
55075533 // requires encode_adm
55085534 // apply set_timestep_embedding with dim 256
55095535
5536+ sd->rng ->manual_seed (seed);
5537+ struct ggml_tensor * noise = ggml_dup_tensor (work_ctx, init_latent);
5538+ ggml_tensor_set_f32_randn (noise, sd->rng );
5539+
55105540 LOG_INFO (" sampling using %s method" , sampling_methods_str[sample_method]);
5511- struct ggml_tensor * x_0 = sd->sample (work_ctx, init_latent, c, uc, cfg_scale, sample_method, sigma_sched);
5541+ struct ggml_tensor * x_0 = sd->sample (work_ctx, init_latent, noise, c, uc, cfg_scale, sample_method, sigma_sched);
55125542 // struct ggml_tensor *x_0 = load_tensor_from_file(ctx, "samples_ddim.bin");
55135543 // print_ggml_tensor(x_0);
55145544 int64_t t3 = ggml_time_ms ();
@@ -5517,7 +5547,7 @@ std::vector<uint8_t*> StableDiffusion::img2img(const uint8_t* init_img_data,
55175547 sd->diffusion_model .destroy ();
55185548 }
55195549
5520- struct ggml_tensor * img = sd->compute_first_stage (work_ctx, x_0, true );
5550+ struct ggml_tensor * img = sd->decode_first_stage (work_ctx, x_0);
55215551 if (img != NULL ) {
55225552 result.push_back (sd_tensor_to_image (img));
55235553 }
0 commit comments