diff --git a/common.hpp b/common.hpp
index 59540752..c68ddafe 100644
--- a/common.hpp
+++ b/common.hpp
@@ -410,6 +410,23 @@ class SpatialTransformer : public GGMLBlock {
     int64_t context_dim = 768; // hidden_size, 1024 for VERSION_SD2
     bool use_linear = false;
 
+    // Rebuild proj_in/proj_out to match the checkpoint: a 4-D proj_out.weight means Conv2d, a 2-D one means Linear.
+    void init_params(struct ggml_context* ctx, const String2TensorStorage& tensor_storage_map = {}, const std::string prefix = "") {
+        auto iter = tensor_storage_map.find(prefix + "proj_out.weight");
+        if (iter != tensor_storage_map.end()) {
+            int64_t inner_dim = n_head * d_head;
+            if (iter->second.n_dims == 4 && use_linear) {
+                use_linear = false;
+                blocks["proj_in"] = std::make_shared<Conv2d>(in_channels, inner_dim, std::pair{1, 1});
+                blocks["proj_out"] = std::make_shared<Conv2d>(inner_dim, in_channels, std::pair{1, 1});
+            } else if (iter->second.n_dims == 2 && !use_linear) {
+                use_linear = true;
+                blocks["proj_in"] = std::make_shared<Linear>(in_channels, inner_dim);
+                blocks["proj_out"] = std::make_shared<Linear>(inner_dim, in_channels);
+            }
+        }
+    }
+
 public:
     SpatialTransformer(int64_t in_channels,
                        int64_t n_head,
diff --git a/ggml_extend.hpp b/ggml_extend.hpp
index d11e07a1..ac6a2ccc 100644
--- a/ggml_extend.hpp
+++ b/ggml_extend.hpp
@@ -1926,8 +1926,8 @@ class GGMLBlock {
         if (prefix.size() > 0) {
             prefix = prefix + ".";
         }
-        init_blocks(ctx, tensor_storage_map, prefix);
         init_params(ctx, tensor_storage_map, prefix);
+        init_blocks(ctx, tensor_storage_map, prefix);
     }
 
     size_t get_params_num() {
diff --git a/stable-diffusion.cpp b/stable-diffusion.cpp
index 9faba955..d285f6ca 100644
--- a/stable-diffusion.cpp
+++ b/stable-diffusion.cpp
@@ -1645,7 +1645,9 @@ class StableDiffusionGGML {
         } else {
             latent = gaussian_latent_sample(work_ctx, vae_output);
         }
-        process_latent_in(latent);
+        if (!use_tiny_autoencoder) {
+            process_latent_in(latent);
+        }
         if (sd_version_is_qwen_image(version)) {
             latent = ggml_reshape_4d(work_ctx, latent, latent->ne[0], latent->ne[1], latent->ne[3], 1);
         }
diff --git a/vae.hpp b/vae.hpp
index ddf970c9..9fc8fb75 100644
--- a/vae.hpp
+++ b/vae.hpp
@@ -66,6 +66,26 @@ class AttnBlock : public UnaryBlock {
     int64_t in_channels;
     bool use_linear;
 
+    // Rebuild q/k/v/proj_out to match the checkpoint: a 4-D proj_out.weight means Conv2d, a 2-D one means Linear.
+    void init_params(struct ggml_context* ctx, const String2TensorStorage& tensor_storage_map = {}, const std::string prefix = "") {
+        auto iter = tensor_storage_map.find(prefix + "proj_out.weight");
+        if (iter != tensor_storage_map.end()) {
+            if (iter->second.n_dims == 4 && use_linear) {
+                use_linear = false;
+                blocks["q"] = std::make_shared<Conv2d>(in_channels, in_channels, std::pair{1, 1});
+                blocks["k"] = std::make_shared<Conv2d>(in_channels, in_channels, std::pair{1, 1});
+                blocks["v"] = std::make_shared<Conv2d>(in_channels, in_channels, std::pair{1, 1});
+                blocks["proj_out"] = std::make_shared<Conv2d>(in_channels, in_channels, std::pair{1, 1});
+            } else if (iter->second.n_dims == 2 && !use_linear) {
+                use_linear = true;
+                blocks["q"] = std::make_shared<Linear>(in_channels, in_channels);
+                blocks["k"] = std::make_shared<Linear>(in_channels, in_channels);
+                blocks["v"] = std::make_shared<Linear>(in_channels, in_channels);
+                blocks["proj_out"] = std::make_shared<Linear>(in_channels, in_channels);
+            }
+        }
+    }
+
 public:
     AttnBlock(int64_t in_channels, bool use_linear)
         : in_channels(in_channels), use_linear(use_linear) {