diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml index a4d6ca612..8dd7ce19c 100644 --- a/.github/workflows/build.yml +++ b/.github/workflows/build.yml @@ -65,7 +65,7 @@ jobs: - name: Get commit hash id: commit - if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/main' ) || github.event.inputs.create_release == 'true' }} + if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }} uses: pr-mpt/actions-commit-hash@v2 - name: Fetch system info @@ -118,7 +118,7 @@ jobs: - name: Get commit hash id: commit - if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/main' ) || github.event.inputs.create_release == 'true' }} + if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }} uses: pr-mpt/actions-commit-hash@v2 - name: Fetch system info @@ -164,8 +164,6 @@ jobs: defines: "-DGGML_NATIVE=OFF -DGGML_AVX512=ON -DGGML_AVX=ON -DGGML_AVX2=ON -DSD_BUILD_SHARED_LIBS=ON" - build: "cuda12" defines: "-DSD_CUDA=ON -DSD_BUILD_SHARED_LIBS=ON -DCMAKE_CUDA_ARCHITECTURES=90;89;86;80;75" - # - build: "rocm5.5" - # defines: '-G Ninja -DCMAKE_C_COMPILER=clang -DCMAKE_CXX_COMPILER=clang++ -DSD_HIPBLAS=ON -DCMAKE_BUILD_TYPE=Release -DAMDGPU_TARGETS="gfx1100;gfx1102;gfx1030" -DSD_BUILD_SHARED_LIBS=ON' - build: 'vulkan' defines: "-DSD_VULKAN=ON -DSD_BUILD_SHARED_LIBS=ON" steps: @@ -184,22 +182,9 @@ jobs: method: "network" sub-packages: '["nvcc", "cudart", "cublas", "cublas_dev", "thrust", "visual_studio_integration"]' - - name: Install rocm-toolkit - id: rocm-toolkit - if: ${{ matrix.build == 'rocm5.5' }} - uses: Cyberhan123/rocm-toolkit@v0.1.0 - with: - rocm: "5.5.0" - - - name: Install Ninja - id: install-ninja - if: ${{ matrix.build == 'rocm5.5' }} - uses: urkle/action-get-ninja@v1 - with: - version: 1.11.1 - name: Install Vulkan SDK id: get_vulkan - if: ${{ matrix.build == 'vulkan' }} https://sdk.lunarg.com/sdk/download/1.4.328.1/windows/vulkansdk-windows-X64-1.4.328.1.exe + if: ${{ matrix.build == 'vulkan' }} run: | curl.exe -o $env:RUNNER_TEMP/VulkanSDK-Installer.exe -L "https://sdk.lunarg.com/sdk/download/$%7Benv:VULKAN_VERSION%7D/windows/vulkansdk-windows-X64-$%7Benv:VULKAN_VERSION%7D.exe" & "$env:RUNNER_TEMP\VulkanSDK-Installer.exe" --accept-licenses --default-answer --confirm-command install @@ -277,6 +262,104 @@ jobs: path: | sd-${{ env.BRANCH_NAME }}-${{ steps.commit.outputs.short }}-bin-win-${{ matrix.build }}-x64.zip + windows-latest-cmake-hip: + runs-on: windows-2022 + + env: + HIPSDK_INSTALLER_VERSION: "25.Q3" + GPU_TARGETS: "gfx1151;gfx1200;gfx1201;gfx1100;gfx1101;gfx1102;gfx1030;gfx1031;gfx1032" + + steps: + - uses: actions/checkout@v3 + with: + submodules: recursive + + - name: Cache ROCm Installation + id: cache-rocm + uses: actions/cache@v4 + with: + path: C:\Program Files\AMD\ROCm + key: rocm-${{ env.HIPSDK_INSTALLER_VERSION }}-${{ runner.os }} + + - name: ccache + uses: ggml-org/ccache-action@v1.2.16 + with: + key: windows-latest-cmake-hip-${{ env.HIPSDK_INSTALLER_VERSION }}-x64 + evict-old-files: 1d + + - name: Install ROCm + if: steps.cache-rocm.outputs.cache-hit != 'true' + run: | + $ErrorActionPreference = "Stop" + write-host "Downloading AMD HIP SDK Installer" + Invoke-WebRequest -Uri "https://download.amd.com/developer/eula/rocm-hub/AMD-Software-PRO-Edition-$%7B%7B%20env.HIPSDK_INSTALLER_VERSION%20%7D%7D-WinSvr2022-For-HIP.exe" -OutFile "${env:RUNNER_TEMP}\rocm-install.exe" + 
write-host "Installing AMD HIP SDK" + $proc = Start-Process "${env:RUNNER_TEMP}\rocm-install.exe" -ArgumentList '-install' -NoNewWindow -PassThru + $completed = $proc.WaitForExit(600000) + if (-not $completed) { + Write-Error "ROCm installation timed out after 10 minutes. Killing the process" + $proc.Kill() + exit 1 + } + if ($proc.ExitCode -ne 0) { + Write-Error "ROCm installation failed with exit code $($proc.ExitCode)" + exit 1 + } + write-host "Completed AMD HIP SDK installation" + + - name: Verify ROCm + run: | + # Find and test ROCm installation + $clangPath = Get-ChildItem 'C:\Program Files\AMD\ROCm\*\bin\clang.exe' | Select-Object -First 1 + if (-not $clangPath) { + Write-Error "ROCm installation not found" + exit 1 + } + & $clangPath.FullName --version + # Set HIP_PATH environment variable for later steps + echo "HIP_PATH=$(Resolve-Path 'C:\Program Files\AMD\ROCm\*\bin\clang.exe' | split-path | split-path)" >> $env:GITHUB_ENV + + - name: Build + run: | + mkdir build + cd build + $env:CMAKE_PREFIX_PATH="${env:HIP_PATH}" + cmake .. ` + -G "Unix Makefiles" ` + -DSD_HIPBLAS=ON ` + -DSD_BUILD_SHARED_LIBS=ON ` + -DGGML_NATIVE=OFF ` + -DCMAKE_C_COMPILER=clang ` + -DCMAKE_CXX_COMPILER=clang++ ` + -DCMAKE_BUILD_TYPE=Release ` + -DGPU_TARGETS="${{ env.GPU_TARGETS }}" + cmake --build . --config Release --parallel ${env:NUMBER_OF_PROCESSORS} + + - name: Get commit hash + id: commit + if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }} + uses: pr-mpt/actions-commit-hash@v2 + + - name: Pack artifacts + if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }} + run: | + md "build\bin\rocblas\library\" + md "build\bin\hipblaslt\library" + cp "${env:HIP_PATH}\bin\hipblas.dll" "build\bin\" + cp "${env:HIP_PATH}\bin\hipblaslt.dll" "build\bin\" + cp "${env:HIP_PATH}\bin\rocblas.dll" "build\bin\" + cp "${env:HIP_PATH}\bin\rocblas\library\*" "build\bin\rocblas\library\" + cp "${env:HIP_PATH}\bin\hipblaslt\library\*" "build\bin\hipblaslt\library\" + 7z a sd-${{ env.BRANCH_NAME }}-${{ steps.commit.outputs.short }}-bin-win-rocm-x64.zip .\build\bin\* + + - name: Upload artifacts + if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }} + uses: actions/upload-artifact@v4 + with: + name: sd-${{ env.BRANCH_NAME }}-${{ steps.commit.outputs.short }}-bin-win-rocm-x64.zip + path: | + sd-${{ env.BRANCH_NAME }}-${{ steps.commit.outputs.short }}-bin-win-rocm-x64.zip + release: if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }} @@ -286,6 +369,7 @@ jobs: - ubuntu-latest-cmake - macOS-latest-cmake - windows-latest-cmake + - windows-latest-cmake-hip steps: - name: Clone diff --git a/README.md b/README.md index 5cc6e4458..058cd5818 100644 --- a/README.md +++ b/README.md @@ -81,7 +81,9 @@ API and command-line option may change frequently.*** - [`DPM++ 2M v2`](https://github.com/AUTOMATIC1111/stable-diffusion-webui/discussions/8457) - `DPM++ 2S a` - [`LCM`](https://github.com/AUTOMATIC1111/stable-diffusion-webui/issues/13952) -- Cross-platform reproducibility (`--rng cuda`, consistent with the `stable-diffusion-webui GPU RNG`) +- Cross-platform reproducibility + - `--rng cuda`, default, consistent with the `stable-diffusion-webui GPU RNG` + - `--rng cpu`, consistent with the `comfyui RNG` - Embedds generation parameters into png 
output as webui-compatible text string ## Quick Start diff --git a/clip.hpp b/clip.hpp index eb37638cc..e2a892ca0 100644 --- a/clip.hpp +++ b/clip.hpp @@ -936,7 +936,7 @@ struct CLIPTextModelRunner : public GGMLRunner { size_t max_token_idx = 0, bool return_pooled = false, int clip_skip = -1) { - struct ggml_cgraph* gf = ggml_new_graph(compute_ctx); + struct ggml_cgraph* gf = new_graph_custom(2048); input_ids = to_backend(input_ids); diff --git a/common.hpp b/common.hpp index c68ddafe5..dd8281f9e 100644 --- a/common.hpp +++ b/common.hpp @@ -182,31 +182,21 @@ class GEGLU : public UnaryBlock { int64_t dim_in; int64_t dim_out; - void init_params(struct ggml_context* ctx, const String2TensorStorage& tensor_storage_map = {}, std::string prefix = "") override { - enum ggml_type wtype = get_type(prefix + "proj.weight", tensor_storage_map, GGML_TYPE_F32); - enum ggml_type bias_wtype = GGML_TYPE_F32; - params["proj.weight"] = ggml_new_tensor_2d(ctx, wtype, dim_in, dim_out * 2); - params["proj.bias"] = ggml_new_tensor_1d(ctx, bias_wtype, dim_out * 2); - } - public: GEGLU(int64_t dim_in, int64_t dim_out) - : dim_in(dim_in), dim_out(dim_out) {} + : dim_in(dim_in), dim_out(dim_out) { + blocks["proj"] = std::shared_ptr(new Linear(dim_in, dim_out * 2)); + } struct ggml_tensor* forward(GGMLRunnerContext* ctx, struct ggml_tensor* x) override { // x: [ne3, ne2, ne1, dim_in] // return: [ne3, ne2, ne1, dim_out] - struct ggml_tensor* w = params["proj.weight"]; - struct ggml_tensor* b = params["proj.bias"]; - - auto x_w = ggml_view_2d(ctx->ggml_ctx, w, w->ne[0], w->ne[1] / 2, w->nb[1], 0); // [dim_out, dim_in] - auto x_b = ggml_view_1d(ctx->ggml_ctx, b, b->ne[0] / 2, 0); // [dim_out, dim_in] - auto gate_w = ggml_view_2d(ctx->ggml_ctx, w, w->ne[0], w->ne[1] / 2, w->nb[1], w->nb[1] * w->ne[1] / 2); // [dim_out, ] - auto gate_b = ggml_view_1d(ctx->ggml_ctx, b, b->ne[0] / 2, b->nb[0] * b->ne[0] / 2); // [dim_out, ] + auto proj = std::dynamic_pointer_cast(blocks["proj"]); - auto x_in = x; - x = ggml_ext_linear(ctx->ggml_ctx, x_in, x_w, x_b); // [ne3, ne2, ne1, dim_out] - auto gate = ggml_ext_linear(ctx->ggml_ctx, x_in, gate_w, gate_b); // [ne3, ne2, ne1, dim_out] + x = proj->forward(ctx, x); // [ne3, ne2, ne1, dim_out*2] + auto x_vec = ggml_ext_chunk(ctx->ggml_ctx, x, 2, 0); + x = x_vec[0]; // [ne3, ne2, ne1, dim_out] + auto gate = x_vec[1]; // [ne3, ne2, ne1, dim_out] gate = ggml_gelu_inplace(ctx->ggml_ctx, gate); diff --git a/conditioner.hpp b/conditioner.hpp index 93e0c2818..94e98a511 100644 --- a/conditioner.hpp +++ b/conditioner.hpp @@ -34,6 +34,7 @@ struct Conditioner { virtual void free_params_buffer() = 0; virtual void get_param_tensors(std::map& tensors) = 0; virtual size_t get_params_buffer_size() = 0; + virtual void set_weight_adapter(const std::shared_ptr& adapter) {} virtual std::tuple> get_learned_condition_with_trigger(ggml_context* work_ctx, int n_threads, const ConditionerParams& conditioner_params) { @@ -108,6 +109,13 @@ struct FrozenCLIPEmbedderWithCustomWords : public Conditioner { return buffer_size; } + void set_weight_adapter(const std::shared_ptr& adapter) override { + text_model->set_weight_adapter(adapter); + if (sd_version_is_sdxl(version)) { + text_model2->set_weight_adapter(adapter); + } + } + bool load_embedding(std::string embd_name, std::string embd_path, std::vector& bpe_tokens) { // the order matters ModelLoader model_loader; @@ -270,13 +278,30 @@ struct FrozenCLIPEmbedderWithCustomWords : public Conditioner { const std::string& curr_text = item.first; float curr_weight = 
item.second; // printf(" %s: %f \n", curr_text.c_str(), curr_weight); + int32_t clean_index = 0; + if (curr_text == "BREAK" && curr_weight == -1.0f) { + // Pad token array up to chunk size at this point. + // TODO: This is a hardcoded chunk_len, like in stable-diffusion.cpp, make it a parameter for the future? + // Also, this is 75 instead of 77 to leave room for BOS and EOS tokens. + int padding_size = 75 - (tokens_acc % 75); + for (int j = 0; j < padding_size; j++) { + clean_input_ids.push_back(tokenizer.EOS_TOKEN_ID); + clean_index++; + } + + // After padding, continue to the next iteration to process the following text as a new segment + tokens.insert(tokens.end(), clean_input_ids.begin(), clean_input_ids.end()); + weights.insert(weights.end(), padding_size, curr_weight); + continue; + } + + // Regular token, process normally std::vector curr_tokens = tokenizer.encode(curr_text, on_new_token_cb); - int32_t clean_index = 0; for (uint32_t i = 0; i < curr_tokens.size(); i++) { int token_id = curr_tokens[i]; - if (token_id == image_token) + if (token_id == image_token) { class_token_index.push_back(clean_index - 1); - else { + } else { clean_input_ids.push_back(token_id); clean_index++; } @@ -379,6 +404,22 @@ struct FrozenCLIPEmbedderWithCustomWords : public Conditioner { for (const auto& item : parsed_attention) { const std::string& curr_text = item.first; float curr_weight = item.second; + + if (curr_text == "BREAK" && curr_weight == -1.0f) { + // Pad token array up to chunk size at this point. + // TODO: This is a hardcoded chunk_len, like in stable-diffusion.cpp, make it a parameter for the future? + // Also, this is 75 instead of 77 to leave room for BOS and EOS tokens. + size_t current_size = tokens.size(); + size_t padding_size = (75 - (current_size % 75)) % 75; // Ensure no negative padding + + if (padding_size > 0) { + LOG_DEBUG("BREAK token encountered, padding current chunk by %zu tokens.", padding_size); + tokens.insert(tokens.end(), padding_size, tokenizer.EOS_TOKEN_ID); + weights.insert(weights.end(), padding_size, 1.0f); + } + continue; // Skip to the next item after handling BREAK + } + std::vector curr_tokens = tokenizer.encode(curr_text, on_new_token_cb); tokens.insert(tokens.end(), curr_tokens.begin(), curr_tokens.end()); weights.insert(weights.end(), curr_tokens.size(), curr_weight); @@ -764,6 +805,18 @@ struct SD3CLIPEmbedder : public Conditioner { return buffer_size; } + void set_weight_adapter(const std::shared_ptr& adapter) override { + if (clip_l) { + clip_l->set_weight_adapter(adapter); + } + if (clip_g) { + clip_g->set_weight_adapter(adapter); + } + if (t5) { + t5->set_weight_adapter(adapter); + } + } + std::vector, std::vector>> tokenize(std::string text, size_t max_length = 0, bool padding = false) { @@ -1160,6 +1213,15 @@ struct FluxCLIPEmbedder : public Conditioner { return buffer_size; } + void set_weight_adapter(const std::shared_ptr& adapter) { + if (clip_l) { + clip_l->set_weight_adapter(adapter); + } + if (t5) { + t5->set_weight_adapter(adapter); + } + } + std::vector, std::vector>> tokenize(std::string text, size_t max_length = 0, bool padding = false) { @@ -1400,6 +1462,12 @@ struct T5CLIPEmbedder : public Conditioner { return buffer_size; } + void set_weight_adapter(const std::shared_ptr& adapter) override { + if (t5) { + t5->set_weight_adapter(adapter); + } + } + std::tuple, std::vector, std::vector> tokenize(std::string text, size_t max_length = 0, bool padding = false) { @@ -1589,6 +1657,12 @@ struct Qwen2_5_VLCLIPEmbedder : public Conditioner { 
return buffer_size; } + void set_weight_adapter(const std::shared_ptr& adapter) override { + if (qwenvl) { + qwenvl->set_weight_adapter(adapter); + } + } + std::tuple, std::vector> tokenize(std::string text, size_t max_length = 0, size_t system_prompt_length = 0, diff --git a/control.hpp b/control.hpp index b34140efb..d86f64cbf 100644 --- a/control.hpp +++ b/control.hpp @@ -380,7 +380,7 @@ struct ControlNet : public GGMLRunner { struct ggml_tensor* timesteps, struct ggml_tensor* context, struct ggml_tensor* y = nullptr) { - struct ggml_cgraph* gf = ggml_new_graph_custom(compute_ctx, CONTROL_NET_GRAPH_SIZE, false); + struct ggml_cgraph* gf = new_graph_custom(CONTROL_NET_GRAPH_SIZE); x = to_backend(x); if (guided_hint_cached) { diff --git a/diffusion_model.hpp b/diffusion_model.hpp index 307049814..0a3914edc 100644 --- a/diffusion_model.hpp +++ b/diffusion_model.hpp @@ -35,8 +35,9 @@ struct DiffusionModel { virtual void free_compute_buffer() = 0; virtual void get_param_tensors(std::map& tensors) = 0; virtual size_t get_params_buffer_size() = 0; - virtual int64_t get_adm_in_channels() = 0; - virtual void set_flash_attn_enabled(bool enabled) = 0; + virtual void set_weight_adapter(const std::shared_ptr& adapter){}; + virtual int64_t get_adm_in_channels() = 0; + virtual void set_flash_attn_enabled(bool enabled) = 0; }; struct UNetModel : public DiffusionModel { @@ -73,6 +74,10 @@ struct UNetModel : public DiffusionModel { return unet.get_params_buffer_size(); } + void set_weight_adapter(const std::shared_ptr& adapter) override { + unet.set_weight_adapter(adapter); + } + int64_t get_adm_in_channels() override { return unet.unet.adm_in_channels; } @@ -130,6 +135,10 @@ struct MMDiTModel : public DiffusionModel { return mmdit.get_params_buffer_size(); } + void set_weight_adapter(const std::shared_ptr& adapter) override { + mmdit.set_weight_adapter(adapter); + } + int64_t get_adm_in_channels() override { return 768 + 1280; } @@ -188,6 +197,10 @@ struct FluxModel : public DiffusionModel { return flux.get_params_buffer_size(); } + void set_weight_adapter(const std::shared_ptr& adapter) override { + flux.set_weight_adapter(adapter); + } + int64_t get_adm_in_channels() override { return 768; } @@ -251,6 +264,10 @@ struct WanModel : public DiffusionModel { return wan.get_params_buffer_size(); } + void set_weight_adapter(const std::shared_ptr& adapter) override { + wan.set_weight_adapter(adapter); + } + int64_t get_adm_in_channels() override { return 768; } @@ -313,6 +330,10 @@ struct QwenImageModel : public DiffusionModel { return qwen_image.get_params_buffer_size(); } + void set_weight_adapter(const std::shared_ptr& adapter) override { + qwen_image.set_weight_adapter(adapter); + } + int64_t get_adm_in_channels() override { return 768; } diff --git a/docs/lora.md b/docs/lora.md index 9885ae549..fe4fbc0b3 100644 --- a/docs/lora.md +++ b/docs/lora.md @@ -12,38 +12,15 @@ Here's a simple example: `../models/marblesh.safetensors` or `../models/marblesh.ckpt` will be applied to the model -# Support matrix - -> ℹ️ CUDA `get_rows` support is defined here: -> [ggml-org/ggml/src/ggml-cuda/getrows.cu#L156](https://github.com/ggml-org/ggml/blob/7dee1d6a1e7611f238d09be96738388da97c88ed/src/ggml-cuda/getrows.cu#L156) -> Currently only the basic types + Q4/Q5/Q8 are implemented. K-quants are **not** supported. - -NOTE: The other backends may have different support. 
- -

| Quant / Type | CUDA | Vulkan |
|--------------|------|--------| | F32 | ✔️ | ✔️ | | F16 | ✔️ | ✔️ | | BF16 | ✔️ | ✔️ | | I32 | ✔️ | ❌ | | Q4_0 | ✔️ | ✔️ | | Q4_1 | ✔️ | ✔️ | | Q5_0 | ✔️ | ✔️ | | Q5_1 | ✔️ | ✔️ | | Q8_0 | ✔️ | ✔️ | | Q2_K | ❌ | ❌ | | Q3_K | ❌ | ❌ | | Q4_K | ❌ | ❌ | | Q5_K | ❌ | ❌ | | Q6_K | ❌ | ❌ | | Q8_K | ❌ | ❌ | | IQ1_S | ❌ | ✔️ | | IQ1_M | ❌ | ✔️ | | IQ2_XXS | ❌ | ✔️ | | IQ2_XS | ❌ | ✔️ | | IQ2_S | ❌ | ✔️ | | IQ3_XXS | ❌ | ✔️ | | IQ3_S | ❌ | ✔️ | | IQ4_XS | ❌ | ✔️ | | IQ4_NL | ❌ | ✔️ | | MXFP4 | ❌ | ✔️ | +# LoRA Apply Mode + +There are two ways to apply LoRA: **immediately** and **at_runtime**. You can specify it using the `--lora-apply-mode` parameter. + +By default, the mode is selected automatically: + +* If the model weights contain any quantized parameters, the **at_runtime** mode is used; +* Otherwise, the **immediately** mode is used. + +The **immediately** mode may have precision and compatibility issues with quantized parameters, but it usually offers faster inference speed and, in some cases, lower memory usage. +In contrast, the **at_runtime** mode provides better compatibility and higher precision, but inference may be slower and memory usage may be higher in some cases. + diff --git a/esrgan.hpp b/esrgan.hpp index adce62342..fb09544e0 100644 --- a/esrgan.hpp +++ b/esrgan.hpp @@ -344,7 +344,7 @@ struct ESRGAN : public GGMLRunner { if (!rrdb_net) return nullptr; constexpr int kGraphNodes = 1 << 16; // 65k - struct ggml_cgraph* gf = ggml_new_graph_custom(compute_ctx, kGraphNodes, /*grads*/ false); + struct ggml_cgraph* gf = new_graph_custom(kGraphNodes); x = to_backend(x); auto runner_ctx = get_context(); diff --git a/examples/cli/README.md b/examples/cli/README.md index 00e0942f1..c9cb46b20 100644 --- a/examples/cli/README.md +++ b/examples/cli/README.md @@ -94,11 +94,18 @@ Options: -M, --mode run mode, one of [img_gen, vid_gen, upscale, convert], default: img_gen --type weight type (examples: f32, f16, q4_0, q4_1, q5_0, q5_1, q8_0, q2_K, q3_K, q4_K). If not specified, the default is the type of the weight file - --rng RNG, one of [std_default, cuda], default: cuda + --rng RNG, one of [std_default, cuda, cpu], default: cuda(sd-webui), cpu(comfyui) + --sampler-rng sampler RNG, one of [std_default, cuda, cpu]. If not specified, use --rng -s, --seed RNG seed (default: 42, use random seed for < 0) --sampling-method sampling method, one of [euler, euler_a, heun, dpm2, dpm++2s_a, dpm++2m, dpm++2mv2, ipndm, ipndm_v, lcm, ddim_trailing, tcd] (default: euler for Flux/SD3/Wan, euler_a otherwise) --prediction prediction type override, one of [eps, v, edm_v, sd3_flow, flux_flow] + --lora-apply-mode the way to apply LoRA, one of [auto, immediately, at_runtime], default is auto. In auto mode, if the model weights + contain any quantized parameters, the at_runtime mode will be used; otherwise, + immediately will be used. The immediately mode may have precision and + compatibility issues with quantized parameters, but it usually offers faster inference + speed and, in some cases, lower memory usage. The at_runtime mode, on the other
--scheduler denoiser sigma scheduler, one of [discrete, karras, exponential, ays, gits, smoothstep, sgm_uniform, simple], default: discrete --skip-layers layers to skip for SLG steps (default: [7,8,9]) @@ -113,4 +120,4 @@ Options: --vae-relative-tile-size relative tile size for vae tiling, format [X]x[Y], in fraction of image size if < 1, in number of tiles per dim if >=1 (overrides --vae-tile-size) --preview preview method. must be one of the following [none, proj, tae, vae] (default is none) -``` \ No newline at end of file +``` diff --git a/examples/cli/main.cpp b/examples/cli/main.cpp index 619c42847..3cfe9281a 100644 --- a/examples/cli/main.cpp +++ b/examples/cli/main.cpp @@ -110,21 +110,22 @@ struct SDParams { int fps = 16; float vace_strength = 1.f; - float strength = 0.75f; - float control_strength = 0.9f; - rng_type_t rng_type = CUDA_RNG; - int64_t seed = 42; - bool verbose = false; - bool offload_params_to_cpu = false; - bool control_net_cpu = false; - bool clip_on_cpu = false; - bool vae_on_cpu = false; - bool diffusion_flash_attn = false; - bool diffusion_conv_direct = false; - bool vae_conv_direct = false; - bool canny_preprocess = false; - bool color = false; - int upscale_repeats = 1; + float strength = 0.75f; + float control_strength = 0.9f; + rng_type_t rng_type = CUDA_RNG; + rng_type_t sampler_rng_type = RNG_TYPE_COUNT; + int64_t seed = 42; + bool verbose = false; + bool offload_params_to_cpu = false; + bool control_net_cpu = false; + bool clip_on_cpu = false; + bool vae_on_cpu = false; + bool diffusion_flash_attn = false; + bool diffusion_conv_direct = false; + bool vae_conv_direct = false; + bool canny_preprocess = false; + bool color = false; + int upscale_repeats = 1; // Photo Maker std::string photo_maker_path; @@ -137,7 +138,8 @@ struct SDParams { int chroma_t5_mask_pad = 1; float flow_shift = INFINITY; - prediction_t prediction = DEFAULT_PRED; + prediction_t prediction = DEFAULT_PRED; + lora_apply_mode_t lora_apply_mode = LORA_APPLY_AUTO; sd_tiling_params_t vae_tiling_params = {false, 0, 0, 0.5f, 0.0f, 0.0f}; bool force_sdxl_vae_conv_scale = false; @@ -209,9 +211,11 @@ void print_params(SDParams params) { printf(" high_noise_sample_params: %s\n", SAFE_STR(high_noise_sample_params_str)); printf(" moe_boundary: %.3f\n", params.moe_boundary); printf(" prediction: %s\n", sd_prediction_name(params.prediction)); + printf(" lora_apply_mode: %s\n", sd_lora_apply_mode_name(params.lora_apply_mode)); printf(" flow_shift: %.2f\n", params.flow_shift); printf(" strength(img2img): %.2f\n", params.strength); printf(" rng: %s\n", sd_rng_type_name(params.rng_type)); + printf(" sampler rng: %s\n", sd_rng_type_name(params.sampler_rng_type)); printf(" seed: %zd\n", params.seed); printf(" batch_count: %d\n", params.batch_count); printf(" vae_tiling: %s\n", params.vae_tiling_params.enabled ? 
"true" : "false"); @@ -884,6 +888,20 @@ void parse_args(int argc, const char** argv, SDParams& params) { return 1; }; + auto on_sampler_rng_arg = [&](int argc, const char** argv, int index) { + if (++index >= argc) { + return -1; + } + const char* arg = argv[index]; + params.sampler_rng_type = str_to_rng_type(arg); + if (params.sampler_rng_type == RNG_TYPE_COUNT) { + fprintf(stderr, "error: invalid sampler rng type %s\n", + arg); + return -1; + } + return 1; + }; + auto on_schedule_arg = [&](int argc, const char** argv, int index) { if (++index >= argc) { return -1; @@ -926,6 +944,20 @@ void parse_args(int argc, const char** argv, SDParams& params) { return 1; }; + auto on_lora_apply_mode_arg = [&](int argc, const char** argv, int index) { + if (++index >= argc) { + return -1; + } + const char* arg = argv[index]; + params.lora_apply_mode = str_to_lora_apply_mode(arg); + if (params.lora_apply_mode == LORA_APPLY_MODE_COUNT) { + fprintf(stderr, "error: invalid lora apply model %s\n", + arg); + return -1; + } + return 1; + }; + auto on_sample_method_arg = [&](int argc, const char** argv, int index) { if (++index >= argc) { return -1; @@ -1108,8 +1140,12 @@ void parse_args(int argc, const char** argv, SDParams& params) { on_type_arg}, {"", "--rng", - "RNG, one of [std_default, cuda], default: cuda", + "RNG, one of [std_default, cuda, cpu], default: cuda(sd-webui), cpu(comfyui)", on_rng_arg}, + {"", + "--sampler-rng", + "sampler RNG, one of [std_default, cuda, cpu]. If not specified, use --rng", + on_sampler_rng_arg}, {"-s", "--seed", "RNG seed (default: 42, use random seed for < 0)", @@ -1123,6 +1159,14 @@ void parse_args(int argc, const char** argv, SDParams& params) { "--prediction", "prediction type override, one of [eps, v, edm_v, sd3_flow, flux_flow]", on_prediction_arg}, + {"", + "--lora-apply-mode", + "the way to apply LoRA, one of [auto, immediately, at_runtime], default is auto. " + "In auto mode, if the model weights contain any quantized parameters, the at_runtime mode will be used; otherwise, immediately will be used." + "The immediately mode may have precision and compatibility issues with quantized parameters, " + "but it usually offers faster inference speed and, in some cases, lower memory usage. 
" + "The at_runtime mode, on the other hand, is exactly the opposite.", + on_lora_apply_mode_arg}, {"", "--scheduler", "denoiser sigma scheduler, one of [discrete, karras, exponential, ays, gits, smoothstep, sgm_uniform, simple], default: discrete", @@ -1217,10 +1261,6 @@ void parse_args(int argc, const char** argv, SDParams& params) { exit(1); } - if (params.mode != CONVERT && params.tensor_type_rules.size() > 0) { - fprintf(stderr, "warning: --tensor-type-rules is currently supported only for conversion\n"); - } - if (params.mode == VID_GEN && params.video_frames <= 0) { fprintf(stderr, "warning: --video-frames must be at least 1\n"); exit(1); @@ -1299,6 +1339,9 @@ std::string get_image_params(SDParams params, int64_t seed) { parameter_string += "Size: " + std::to_string(params.width) + "x" + std::to_string(params.height) + ", "; parameter_string += "Model: " + sd_basename(params.model_path) + ", "; parameter_string += "RNG: " + std::string(sd_rng_type_name(params.rng_type)) + ", "; + if (params.sampler_rng_type != RNG_TYPE_COUNT) { + parameter_string += "Sampler RNG: " + std::string(sd_rng_type_name(params.sampler_rng_type)) + ", "; + } parameter_string += "Sampler: " + std::string(sd_sample_method_name(params.sample_params.sample_method)); if (params.sample_params.scheduler != DEFAULT) { parameter_string += " " + std::string(sd_schedule_name(params.sample_params.scheduler)); @@ -1732,12 +1775,15 @@ int main(int argc, const char* argv[]) { params.lora_model_dir.c_str(), params.embedding_dir.c_str(), params.photo_maker_path.c_str(), + params.tensor_type_rules.c_str(), vae_decode_only, true, params.n_threads, params.wtype, params.rng_type, + params.sampler_rng_type, params.prediction, + params.lora_apply_mode, params.offload_params_to_cpu, params.clip_on_cpu, params.control_net_cpu, diff --git a/flux.hpp b/flux.hpp index 8a255aa16..2f85cf8c1 100644 --- a/flux.hpp +++ b/flux.hpp @@ -1243,7 +1243,7 @@ namespace Flux { bool increase_ref_index = false, std::vector skip_layers = {}) { GGML_ASSERT(x->ne[3] == 1); - struct ggml_cgraph* gf = ggml_new_graph_custom(compute_ctx, FLUX_GRAPH_SIZE, false); + struct ggml_cgraph* gf = new_graph_custom(FLUX_GRAPH_SIZE); struct ggml_tensor* mod_index_arange = nullptr; struct ggml_tensor* dct = nullptr; // for chroma radiance diff --git a/ggml_extend.hpp b/ggml_extend.hpp index eaf501656..aa1664589 100644 --- a/ggml_extend.hpp +++ b/ggml_extend.hpp @@ -959,12 +959,15 @@ __STATIC_INLINE__ struct ggml_tensor* ggml_ext_linear(struct ggml_context* ctx, int64_t ne3 = x->ne[3]; x = ggml_reshape_2d(ctx, x, x->ne[0], x->ne[1] * x->ne[2] * x->ne[3]); x = ggml_mul_mat(ctx, w, x); - x = ggml_reshape_4d(ctx, x, x->ne[0], x->ne[1] / ne2 / ne3, ne2, ne3); + if (force_prec_f32) { + ggml_mul_mat_set_prec(x, GGML_PREC_F32); + } + x = ggml_reshape_4d(ctx, x, x->ne[0], x->ne[1] / ne2 / ne3, ne2, ne3); } else { x = ggml_mul_mat(ctx, w, x); - } - if (force_prec_f32) { - ggml_mul_mat_set_prec(x, GGML_PREC_F32); + if (force_prec_f32) { + ggml_mul_mat_set_prec(x, GGML_PREC_F32); + } } if (scale != 1.f) { x = ggml_scale(ctx, x, 1.f / scale); @@ -1119,6 +1122,18 @@ __STATIC_INLINE__ struct ggml_tensor* ggml_ext_ones(struct ggml_context* ctx, return ggml_ext_full(ctx, 1.f, ne0, ne1, ne2, ne3); } +__STATIC_INLINE__ ggml_tensor* ggml_ext_cast_f32(ggml_context* ctx, ggml_tensor* a) { + auto out = ggml_reshape_2d(ctx, a, 1, ggml_nelements(a)); + ggml_tensor* one = ggml_ext_ones(ctx, 1, 1, 1, 1); // [1,] + if (ggml_is_transposed(out)) { + out = ggml_mul_mat(ctx, one, out); + } else { + 
out = ggml_mul_mat(ctx, out, one); + } + out = ggml_reshape(ctx, out, a); + return out; +} + // q: [N * n_head, n_token, d_head] // k: [N * n_head, n_k, d_head] // v: [N * n_head, d_head, n_k] @@ -1460,11 +1475,43 @@ __STATIC_INLINE__ size_t ggml_tensor_num(ggml_context* ctx) { #define MAX_PARAMS_TENSOR_NUM 32768 #define MAX_GRAPH_SIZE 327680 +struct WeightAdapter { + struct ForwardParams { + enum class op_type_t { + OP_LINEAR, + OP_CONV2D, + } op_type; + struct { + bool force_prec_f32 = false; + float scale = 1.f; + } linear; + struct { + int s0 = 1; + int s1 = 1; + int p0 = 0; + int p1 = 0; + int d0 = 1; + int d1 = 1; + bool direct = false; + float scale = 1.f; + } conv2d; + }; + virtual ggml_tensor* patch_weight(ggml_context* ctx, ggml_tensor* weight, const std::string& weight_name) = 0; + virtual ggml_tensor* forward_with_lora(ggml_context* ctx, + ggml_tensor* x, + ggml_tensor* w, + ggml_tensor* b, + const std::string& prefix, + ForwardParams forward_params) = 0; + virtual size_t get_extra_graph_size() = 0; +}; + struct GGMLRunnerContext { - ggml_backend_t backend = nullptr; - ggml_context* ggml_ctx = nullptr; - bool flash_attn_enabled = false; - bool conv2d_direct_enabled = false; + ggml_backend_t backend = nullptr; + ggml_context* ggml_ctx = nullptr; + bool flash_attn_enabled = false; + bool conv2d_direct_enabled = false; + std::shared_ptr weight_adapter = nullptr; }; struct GGMLRunner { @@ -1486,6 +1533,8 @@ struct GGMLRunner { struct ggml_context* compute_ctx = nullptr; struct ggml_gallocr* compute_allocr = nullptr; + std::shared_ptr weight_adapter = nullptr; + std::vector one_vec = {1.f}; ggml_tensor* one_tensor = nullptr; @@ -1565,6 +1614,13 @@ struct GGMLRunner { ggml_build_forward_expand(gf, one_tensor); } + struct ggml_cgraph* new_graph_custom(size_t graph_size) { + if (weight_adapter) { + graph_size += weight_adapter->get_extra_graph_size(); + } + return ggml_new_graph_custom(compute_ctx, graph_size, false); + } + struct ggml_cgraph* get_compute_graph(get_graph_cb_t get_graph) { prepare_build_in_tensor_before(); struct ggml_cgraph* gf = get_graph(); @@ -1760,6 +1816,7 @@ struct GGMLRunner { runner_ctx.backend = runtime_backend; runner_ctx.flash_attn_enabled = flash_attn_enabled; runner_ctx.conv2d_direct_enabled = conv2d_direct_enabled; + runner_ctx.weight_adapter = weight_adapter; return runner_ctx; } @@ -1891,6 +1948,10 @@ struct GGMLRunner { void set_conv2d_direct_enabled(bool enabled) { conv2d_direct_enabled = enabled; } + + void set_weight_adapter(const std::shared_ptr& adapter) { + weight_adapter = adapter; + } }; class GGMLBlock { @@ -2006,8 +2067,10 @@ class Linear : public UnaryBlock { bool force_f32; bool force_prec_f32; float scale; + std::string prefix; void init_params(struct ggml_context* ctx, const String2TensorStorage& tensor_storage_map = {}, const std::string prefix = "") override { + this->prefix = prefix; enum ggml_type wtype = get_type(prefix + "weight", tensor_storage_map, GGML_TYPE_F32); if (in_features % ggml_blck_size(wtype) != 0 || force_f32) { wtype = GGML_TYPE_F32; @@ -2039,6 +2102,13 @@ class Linear : public UnaryBlock { if (bias) { b = params["bias"]; } + if (ctx->weight_adapter) { + WeightAdapter::ForwardParams forward_params; + forward_params.op_type = WeightAdapter::ForwardParams::op_type_t::OP_LINEAR; + forward_params.linear.force_prec_f32 = force_prec_f32; + forward_params.linear.scale = scale; + return ctx->weight_adapter->forward_with_lora(ctx->ggml_ctx, x, w, b, prefix, forward_params); + } return ggml_ext_linear(ctx->ggml_ctx, x, w, b, 
force_prec_f32, scale); } }; @@ -2098,8 +2168,10 @@ class Conv2d : public UnaryBlock { std::pair dilation; bool bias; float scale = 1.f; + std::string prefix; void init_params(struct ggml_context* ctx, const String2TensorStorage& tensor_storage_map, const std::string prefix = "") override { + this->prefix = prefix; enum ggml_type wtype = GGML_TYPE_F16; params["weight"] = ggml_new_tensor_4d(ctx, wtype, kernel_size.second, kernel_size.first, in_channels, out_channels); if (bias) { @@ -2138,6 +2210,19 @@ class Conv2d : public UnaryBlock { if (bias) { b = params["bias"]; } + if (ctx->weight_adapter) { + WeightAdapter::ForwardParams forward_params; + forward_params.op_type = WeightAdapter::ForwardParams::op_type_t::OP_CONV2D; + forward_params.conv2d.s0 = stride.second; + forward_params.conv2d.s1 = stride.first; + forward_params.conv2d.p0 = padding.second; + forward_params.conv2d.p1 = padding.first; + forward_params.conv2d.d0 = dilation.second; + forward_params.conv2d.d1 = dilation.first; + forward_params.conv2d.direct = ctx->conv2d_direct_enabled; + forward_params.conv2d.scale = scale; + return ctx->weight_adapter->forward_with_lora(ctx->ggml_ctx, x, w, b, prefix, forward_params); + } return ggml_ext_conv_2d(ctx->ggml_ctx, x, w, @@ -2209,8 +2294,10 @@ class Conv3d : public UnaryBlock { std::tuple padding; std::tuple dilation; bool bias; + std::string prefix; void init_params(struct ggml_context* ctx, const String2TensorStorage& tensor_storage_map, const std::string prefix = "") override { + this->prefix = prefix; enum ggml_type wtype = GGML_TYPE_F16; params["weight"] = ggml_new_tensor_4d(ctx, wtype, @@ -2242,8 +2329,17 @@ class Conv3d : public UnaryBlock { struct ggml_tensor* forward(GGMLRunnerContext* ctx, struct ggml_tensor* x) { struct ggml_tensor* w = params["weight"]; struct ggml_tensor* b = nullptr; + if (ctx->weight_adapter) { + w = ctx->weight_adapter->patch_weight(ctx->ggml_ctx, w, prefix + "weight"); + if (w->type != GGML_TYPE_F16) { + w = ggml_cast(ctx->ggml_ctx, w, GGML_TYPE_F16); + } + } if (bias) { b = params["bias"]; + if (ctx->weight_adapter) { + b = ctx->weight_adapter->patch_weight(ctx->ggml_ctx, b, prefix + "bias"); + } } return ggml_ext_conv_3d(ctx->ggml_ctx, x, w, b, in_channels, std::get<2>(stride), std::get<1>(stride), std::get<0>(stride), @@ -2258,8 +2354,10 @@ class LayerNorm : public UnaryBlock { float eps; bool elementwise_affine; bool bias; + std::string prefix; void init_params(struct ggml_context* ctx, const String2TensorStorage& tensor_storage_map = {}, const std::string prefix = "") override { + this->prefix = prefix; if (elementwise_affine) { enum ggml_type wtype = GGML_TYPE_F32; params["weight"] = ggml_new_tensor_1d(ctx, wtype, normalized_shape); @@ -2286,8 +2384,14 @@ class LayerNorm : public UnaryBlock { if (elementwise_affine) { w = params["weight"]; + if (ctx->weight_adapter) { + w = ctx->weight_adapter->patch_weight(ctx->ggml_ctx, w, prefix + "weight"); + } if (bias) { b = params["bias"]; + if (ctx->weight_adapter) { + b = ctx->weight_adapter->patch_weight(ctx->ggml_ctx, b, prefix + "bias"); + } } } return ggml_ext_layer_norm(ctx->ggml_ctx, x, w, b, eps); @@ -2300,8 +2404,10 @@ class GroupNorm : public GGMLBlock { int64_t num_channels; float eps; bool affine; + std::string prefix; void init_params(struct ggml_context* ctx, const String2TensorStorage& tensor_storage_map = {}, const std::string prefix = "") override { + this->prefix = prefix; if (affine) { enum ggml_type wtype = GGML_TYPE_F32; enum ggml_type bias_wtype = GGML_TYPE_F32; @@ -2326,6 +2432,10 @@ 
class GroupNorm : public GGMLBlock { if (affine) { w = params["weight"]; b = params["bias"]; + if (ctx->weight_adapter) { + w = ctx->weight_adapter->patch_weight(ctx->ggml_ctx, w, prefix + "weight"); + b = ctx->weight_adapter->patch_weight(ctx->ggml_ctx, b, prefix + "bias"); + } } return ggml_ext_group_norm(ctx->ggml_ctx, x, w, b, num_groups); } @@ -2341,8 +2451,10 @@ class RMSNorm : public UnaryBlock { protected: int64_t hidden_size; float eps; + std::string prefix; void init_params(struct ggml_context* ctx, const String2TensorStorage& tensor_storage_map = {}, std::string prefix = "") override { + this->prefix = prefix; enum ggml_type wtype = GGML_TYPE_F32; params["weight"] = ggml_new_tensor_1d(ctx, wtype, hidden_size); } @@ -2355,8 +2467,11 @@ class RMSNorm : public UnaryBlock { struct ggml_tensor* forward(GGMLRunnerContext* ctx, struct ggml_tensor* x) { struct ggml_tensor* w = params["weight"]; - x = ggml_rms_norm(ctx->ggml_ctx, x, eps); - x = ggml_mul_inplace(ctx->ggml_ctx, x, w); + if (ctx->weight_adapter) { + w = ctx->weight_adapter->patch_weight(ctx->ggml_ctx, w, prefix + "weight"); + } + x = ggml_rms_norm(ctx->ggml_ctx, x, eps); + x = ggml_mul_inplace(ctx->ggml_ctx, x, w); return x; } }; diff --git a/lora.hpp b/lora.hpp index 6da9d833b..daabd4a01 100644 --- a/lora.hpp +++ b/lora.hpp @@ -7,22 +7,25 @@ #define LORA_GRAPH_BASE_SIZE 10240 struct LoraModel : public GGMLRunner { + std::string lora_id; float multiplier = 1.0f; - std::map lora_tensors; + std::unordered_map lora_tensors; std::map original_tensor_to_final_tensor; + std::set applied_lora_tensors; std::string file_path; ModelLoader model_loader; - bool load_failed = false; - bool applied = false; - bool tensor_preprocessed = false; - std::vector zero_index_vec = {0}; - ggml_tensor* zero_index = nullptr; + bool load_failed = false; + bool applied = false; + bool tensor_preprocessed = false; - LoraModel(ggml_backend_t backend, + typedef std::function filter_t; + + LoraModel(const std::string& lora_id, + ggml_backend_t backend, const std::string& file_path = "", std::string prefix = "", SDVersion version = VERSION_COUNT) - : file_path(file_path), GGMLRunner(backend, false) { + : lora_id(lora_id), file_path(file_path), GGMLRunner(backend, false) { prefix = "lora." 
+ prefix; if (!model_loader.init_from_file_and_convert_name(file_path, prefix, version)) { load_failed = true; @@ -33,7 +36,7 @@ struct LoraModel : public GGMLRunner { return "lora"; } - bool load_from_file(bool filter_tensor, int n_threads) { + bool load_from_file(int n_threads, filter_t filter = nullptr) { LOG_INFO("loading LoRA from '%s'", file_path.c_str()); if (load_failed) { @@ -48,7 +51,7 @@ struct LoraModel : public GGMLRunner { if (dry_run) { const std::string& name = tensor_storage.name; - if (filter_tensor && !contains(name, "lora.model")) { + if (filter && !filter(name)) { return true; } @@ -68,6 +71,10 @@ struct LoraModel : public GGMLRunner { model_loader.load_tensors(on_new_tensor_cb, n_threads); + if (tensors_to_create.empty()) { + return true; + } + for (const auto& pair : tensors_to_create) { const auto& name = pair.first; const auto& ts = pair.second; @@ -87,14 +94,6 @@ struct LoraModel : public GGMLRunner { return true; } - ggml_tensor* to_f32(ggml_context* ctx, ggml_tensor* a) { - auto out = ggml_reshape_1d(ctx, a, ggml_nelements(a)); - out = ggml_get_rows(ctx, out, zero_index); - out = ggml_reshape(ctx, out, a); - // auto out = ggml_cast(ctx, a, GGML_TYPE_F32); - return out; - } - void preprocess_lora_tensors(const std::map& model_tensors) { if (tensor_preprocessed) { return; @@ -102,7 +101,7 @@ struct LoraModel : public GGMLRunner { tensor_preprocessed = true; // I really hate these hardcoded processes. if (model_tensors.find("cond_stage_model.1.transformer.text_model.encoder.layers.0.self_attn.in_proj.weight") != model_tensors.end()) { - std::map new_lora_tensors; + std::unordered_map new_lora_tensors; for (auto& [old_name, tensor] : lora_tensors) { std::string new_name = old_name; @@ -130,7 +129,7 @@ struct LoraModel : public GGMLRunner { } } - ggml_tensor* get_lora_diff(const std::string& model_tensor_name, std::set& applied_lora_tensors) { + ggml_tensor* get_lora_weight_diff(const std::string& model_tensor_name, ggml_context* ctx) { ggml_tensor* updown = nullptr; int index = 0; while (true) { @@ -153,17 +152,17 @@ struct LoraModel : public GGMLRunner { auto iter = lora_tensors.find(lora_up_name); if (iter != lora_tensors.end()) { - lora_up = to_f32(compute_ctx, iter->second); + lora_up = ggml_ext_cast_f32(ctx, iter->second); } iter = lora_tensors.find(lora_mid_name); if (iter != lora_tensors.end()) { - lora_mid = to_f32(compute_ctx, iter->second); + lora_mid = ggml_ext_cast_f32(ctx, iter->second); } iter = lora_tensors.find(lora_down_name); if (iter != lora_tensors.end()) { - lora_down = to_f32(compute_ctx, iter->second); + lora_down = ggml_ext_cast_f32(ctx, iter->second); } if (lora_up == nullptr || lora_down == nullptr) { @@ -195,32 +194,61 @@ struct LoraModel : public GGMLRunner { } scale_value *= multiplier; - auto curr_updown = ggml_ext_merge_lora(compute_ctx, lora_down, lora_up, lora_mid); - curr_updown = ggml_scale_inplace(compute_ctx, curr_updown, scale_value); + auto curr_updown = ggml_ext_merge_lora(ctx, lora_down, lora_up, lora_mid); + curr_updown = ggml_scale_inplace(ctx, curr_updown, scale_value); if (updown == nullptr) { updown = curr_updown; } else { - updown = ggml_concat(compute_ctx, updown, curr_updown, ggml_n_dims(updown) - 1); + updown = ggml_concat(ctx, updown, curr_updown, ggml_n_dims(updown) - 1); } index++; } + return updown; + } - // diff - if (updown == nullptr) { - std::string lora_diff_name = "lora." 
+ model_tensor_name + ".diff"; + ggml_tensor* get_raw_weight_diff(const std::string& model_tensor_name, ggml_context* ctx) { + ggml_tensor* updown = nullptr; + int index = 0; + while (true) { + std::string key; + if (index == 0) { + key = model_tensor_name; + } else { + key = model_tensor_name + "." + std::to_string(index); + } + + std::string diff_name = "lora." + key + ".diff"; + + ggml_tensor* curr_updown = nullptr; + + auto iter = lora_tensors.find(diff_name); + if (iter != lora_tensors.end()) { + curr_updown = ggml_ext_cast_f32(ctx, iter->second); + } else { + break; + } + + applied_lora_tensors.insert(diff_name); + + float scale_value = 1.0f; + scale_value *= multiplier; + + curr_updown = ggml_scale_inplace(ctx, curr_updown, scale_value); - if (lora_tensors.find(lora_diff_name) != lora_tensors.end()) { - updown = to_f32(compute_ctx, lora_tensors[lora_diff_name]); - applied_lora_tensors.insert(lora_diff_name); + if (updown == nullptr) { + updown = curr_updown; + } else { + updown = ggml_concat(ctx, updown, curr_updown, ggml_n_dims(updown) - 1); } - } + index++; + } return updown; } - ggml_tensor* get_loha_diff(const std::string& model_tensor_name, std::set& applied_lora_tensors) { + ggml_tensor* get_loha_weight_diff(const std::string& model_tensor_name, ggml_context* ctx) { ggml_tensor* updown = nullptr; int index = 0; while (true) { @@ -248,34 +276,34 @@ struct LoraModel : public GGMLRunner { auto iter = lora_tensors.find(hada_1_down_name); if (iter != lora_tensors.end()) { - hada_1_down = to_f32(compute_ctx, iter->second); + hada_1_down = ggml_ext_cast_f32(ctx, iter->second); } iter = lora_tensors.find(hada_1_up_name); if (iter != lora_tensors.end()) { - hada_1_up = to_f32(compute_ctx, iter->second); + hada_1_up = ggml_ext_cast_f32(ctx, iter->second); } iter = lora_tensors.find(hada_1_mid_name); if (iter != lora_tensors.end()) { - hada_1_mid = to_f32(compute_ctx, iter->second); - hada_1_up = ggml_cont(compute_ctx, ggml_transpose(compute_ctx, hada_1_up)); + hada_1_mid = ggml_ext_cast_f32(ctx, iter->second); + hada_1_up = ggml_cont(ctx, ggml_transpose(ctx, hada_1_up)); } iter = lora_tensors.find(hada_2_down_name); if (iter != lora_tensors.end()) { - hada_2_down = to_f32(compute_ctx, iter->second); + hada_2_down = ggml_ext_cast_f32(ctx, iter->second); } iter = lora_tensors.find(hada_2_up_name); if (iter != lora_tensors.end()) { - hada_2_up = to_f32(compute_ctx, iter->second); + hada_2_up = ggml_ext_cast_f32(ctx, iter->second); } iter = lora_tensors.find(hada_2_mid_name); if (iter != lora_tensors.end()) { - hada_2_mid = to_f32(compute_ctx, iter->second); - hada_2_up = ggml_cont(compute_ctx, ggml_transpose(compute_ctx, hada_2_up)); + hada_2_mid = ggml_ext_cast_f32(ctx, iter->second); + hada_2_up = ggml_cont(ctx, ggml_transpose(ctx, hada_2_up)); } if (hada_1_up == nullptr || hada_1_down == nullptr || hada_2_up == nullptr || hada_2_down == nullptr) { @@ -309,21 +337,21 @@ struct LoraModel : public GGMLRunner { } scale_value *= multiplier; - struct ggml_tensor* updown_1 = ggml_ext_merge_lora(compute_ctx, hada_1_down, hada_1_up, hada_1_mid); - struct ggml_tensor* updown_2 = ggml_ext_merge_lora(compute_ctx, hada_2_down, hada_2_up, hada_2_mid); - auto curr_updown = ggml_mul_inplace(compute_ctx, updown_1, updown_2); - curr_updown = ggml_scale_inplace(compute_ctx, curr_updown, scale_value); + struct ggml_tensor* updown_1 = ggml_ext_merge_lora(ctx, hada_1_down, hada_1_up, hada_1_mid); + struct ggml_tensor* updown_2 = ggml_ext_merge_lora(ctx, hada_2_down, hada_2_up, hada_2_mid); + auto curr_updown 
= ggml_mul_inplace(ctx, updown_1, updown_2); + curr_updown = ggml_scale_inplace(ctx, curr_updown, scale_value); if (updown == nullptr) { updown = curr_updown; } else { - updown = ggml_concat(compute_ctx, updown, curr_updown, ggml_n_dims(updown) - 1); + updown = ggml_concat(ctx, updown, curr_updown, ggml_n_dims(updown) - 1); } index++; } return updown; } - ggml_tensor* get_lokr_diff(const std::string& model_tensor_name, std::set& applied_lora_tensors) { + ggml_tensor* get_lokr_weight_diff(const std::string& model_tensor_name, ggml_context* ctx) { ggml_tensor* updown = nullptr; int index = 0; while (true) { @@ -350,24 +378,24 @@ struct LoraModel : public GGMLRunner { auto iter = lora_tensors.find(lokr_w1_name); if (iter != lora_tensors.end()) { - lokr_w1 = to_f32(compute_ctx, iter->second); + lokr_w1 = ggml_ext_cast_f32(ctx, iter->second); } iter = lora_tensors.find(lokr_w2_name); if (iter != lora_tensors.end()) { - lokr_w2 = to_f32(compute_ctx, iter->second); + lokr_w2 = ggml_ext_cast_f32(ctx, iter->second); } int64_t rank = 1; if (lokr_w1 == nullptr) { iter = lora_tensors.find(lokr_w1_a_name); if (iter != lora_tensors.end()) { - lokr_w1_a = to_f32(compute_ctx, iter->second); + lokr_w1_a = ggml_ext_cast_f32(ctx, iter->second); } iter = lora_tensors.find(lokr_w1_b_name); if (iter != lora_tensors.end()) { - lokr_w1_b = to_f32(compute_ctx, iter->second); + lokr_w1_b = ggml_ext_cast_f32(ctx, iter->second); } if (lokr_w1_a == nullptr || lokr_w1_b == nullptr) { @@ -376,18 +404,18 @@ struct LoraModel : public GGMLRunner { rank = lokr_w1_b->ne[ggml_n_dims(lokr_w1_b) - 1]; - lokr_w1 = ggml_ext_merge_lora(compute_ctx, lokr_w1_b, lokr_w1_a); + lokr_w1 = ggml_ext_merge_lora(ctx, lokr_w1_b, lokr_w1_a); } if (lokr_w2 == nullptr) { iter = lora_tensors.find(lokr_w2_a_name); if (iter != lora_tensors.end()) { - lokr_w2_a = to_f32(compute_ctx, iter->second); + lokr_w2_a = ggml_ext_cast_f32(ctx, iter->second); } iter = lora_tensors.find(lokr_w2_b_name); if (iter != lora_tensors.end()) { - lokr_w2_b = to_f32(compute_ctx, iter->second); + lokr_w2_b = ggml_ext_cast_f32(ctx, iter->second); } if (lokr_w2_a == nullptr || lokr_w2_b == nullptr) { @@ -396,7 +424,7 @@ struct LoraModel : public GGMLRunner { rank = lokr_w2_b->ne[ggml_n_dims(lokr_w2_b) - 1]; - lokr_w2 = ggml_ext_merge_lora(compute_ctx, lokr_w2_b, lokr_w2_a); + lokr_w2 = ggml_ext_merge_lora(ctx, lokr_w2_b, lokr_w2_a); } if (!lokr_w1_a) { @@ -427,49 +455,208 @@ struct LoraModel : public GGMLRunner { scale_value *= multiplier; - auto curr_updown = ggml_ext_kronecker(compute_ctx, lokr_w1, lokr_w2); - curr_updown = ggml_scale_inplace(compute_ctx, curr_updown, scale_value); + auto curr_updown = ggml_ext_kronecker(ctx, lokr_w1, lokr_w2); + curr_updown = ggml_scale_inplace(ctx, curr_updown, scale_value); if (updown == nullptr) { updown = curr_updown; } else { - updown = ggml_concat(compute_ctx, updown, curr_updown, ggml_n_dims(updown) - 1); + updown = ggml_concat(ctx, updown, curr_updown, ggml_n_dims(updown) - 1); } index++; } return updown; } + ggml_tensor* get_weight_diff(const std::string& model_tensor_name, ggml_context* ctx, ggml_tensor* model_tensor, bool with_lora = true) { + // lora + ggml_tensor* diff = nullptr; + if (with_lora) { + diff = get_lora_weight_diff(model_tensor_name, ctx); + } + // diff + if (diff == nullptr) { + diff = get_raw_weight_diff(model_tensor_name, ctx); + } + // loha + if (diff == nullptr) { + diff = get_loha_weight_diff(model_tensor_name, ctx); + } + // lokr + if (diff == nullptr) { + diff = get_lokr_weight_diff(model_tensor_name, 
ctx); + } + if (diff != nullptr) { + if (ggml_nelements(diff) < ggml_nelements(model_tensor)) { + if (ggml_n_dims(diff) == 2 && ggml_n_dims(model_tensor) == 2 && diff->ne[0] == model_tensor->ne[0]) { + LOG_WARN("pad for %s", model_tensor_name.c_str()); + auto pad_tensor = ggml_ext_zeros(ctx, diff->ne[0], model_tensor->ne[1] - diff->ne[1], 1, 1); + diff = ggml_concat(ctx, diff, pad_tensor, 1); + } + } + + GGML_ASSERT(ggml_nelements(diff) == ggml_nelements(model_tensor)); + diff = ggml_reshape(ctx, diff, model_tensor); + } + return diff; + } + + ggml_tensor* get_out_diff(ggml_context* ctx, + ggml_tensor* x, + WeightAdapter::ForwardParams forward_params, + const std::string& model_tensor_name) { + ggml_tensor* out_diff = nullptr; + int index = 0; + while (true) { + std::string key; + if (index == 0) { + key = model_tensor_name; + } else { + key = model_tensor_name + "." + std::to_string(index); + } + + std::string lora_down_name = "lora." + key + ".lora_down"; + std::string lora_up_name = "lora." + key + ".lora_up"; + std::string lora_mid_name = "lora." + key + ".lora_mid"; + std::string scale_name = "lora." + key + ".scale"; + std::string alpha_name = "lora." + key + ".alpha"; + + ggml_tensor* lora_up = nullptr; + ggml_tensor* lora_mid = nullptr; + ggml_tensor* lora_down = nullptr; + + bool is_conv2d = forward_params.op_type == WeightAdapter::ForwardParams::op_type_t::OP_CONV2D; + + auto iter = lora_tensors.find(lora_up_name); + if (iter != lora_tensors.end()) { + lora_up = iter->second; + if (is_conv2d && lora_up->type != GGML_TYPE_F16) { + lora_up = ggml_cast(ctx, lora_up, GGML_TYPE_F16); + } + } + + iter = lora_tensors.find(lora_mid_name); + if (iter != lora_tensors.end()) { + lora_mid = iter->second; + if (is_conv2d && lora_mid->type != GGML_TYPE_F16) { + lora_mid = ggml_cast(ctx, lora_mid, GGML_TYPE_F16); + } + } + + iter = lora_tensors.find(lora_down_name); + if (iter != lora_tensors.end()) { + lora_down = iter->second; + if (is_conv2d && lora_down->type != GGML_TYPE_F16) { + lora_down = ggml_cast(ctx, lora_down, GGML_TYPE_F16); + } + } + + if (lora_up == nullptr || lora_down == nullptr) { + break; + } + + applied_lora_tensors.insert(lora_up_name); + applied_lora_tensors.insert(lora_down_name); + + if (lora_mid) { + applied_lora_tensors.insert(lora_mid_name); + } + + float scale_value = 1.0f; + + int64_t rank = lora_down->ne[ggml_n_dims(lora_down) - 1]; + iter = lora_tensors.find(scale_name); + if (iter != lora_tensors.end()) { + scale_value = ggml_ext_backend_tensor_get_f32(iter->second); + applied_lora_tensors.insert(scale_name); + } else { + iter = lora_tensors.find(alpha_name); + if (iter != lora_tensors.end()) { + float alpha = ggml_ext_backend_tensor_get_f32(iter->second); + scale_value = alpha / rank; + // LOG_DEBUG("rank %s %ld %.2f %.2f", alpha_name.c_str(), rank, alpha, scale_value); + applied_lora_tensors.insert(alpha_name); + } + } + scale_value *= multiplier; + + ggml_tensor* lx; + if (!is_conv2d) { + lx = ggml_ext_linear(ctx, x, lora_down, nullptr, forward_params.linear.force_prec_f32, forward_params.linear.scale); + if (lora_mid) { + lx = ggml_ext_linear(ctx, lx, lora_mid, nullptr, forward_params.linear.force_prec_f32, forward_params.linear.scale); + } + lx = ggml_ext_linear(ctx, lx, lora_up, nullptr, forward_params.linear.force_prec_f32, forward_params.linear.scale); + } else { // OP_CONV2D + lx = ggml_ext_conv_2d(ctx, + x, + lora_down, + nullptr, + forward_params.conv2d.s0, + forward_params.conv2d.s1, + forward_params.conv2d.p0, + forward_params.conv2d.p1, + 
forward_params.conv2d.d0, + forward_params.conv2d.d1, + forward_params.conv2d.direct, + forward_params.conv2d.scale); + if (lora_mid) { + lx = ggml_ext_conv_2d(ctx, + lx, + lora_mid, + nullptr, + 1, + 1, + 0, + 0, + 1, + 1, + forward_params.conv2d.direct, + forward_params.conv2d.scale); + } + lx = ggml_ext_conv_2d(ctx, + lx, + lora_up, + nullptr, + 1, + 1, + 0, + 0, + 1, + 1, + forward_params.conv2d.direct, + forward_params.conv2d.scale); + } + + auto curr_out_diff = ggml_scale_inplace(ctx, lx, scale_value); + + if (out_diff == nullptr) { + out_diff = curr_out_diff; + } else { + out_diff = ggml_concat(ctx, out_diff, curr_out_diff, ggml_n_dims(out_diff) - 1); + } + + index++; + } + return out_diff; + } + struct ggml_cgraph* build_lora_graph(const std::map& model_tensors, SDVersion version) { size_t lora_graph_size = LORA_GRAPH_BASE_SIZE + lora_tensors.size() * 10; struct ggml_cgraph* gf = ggml_new_graph_custom(compute_ctx, lora_graph_size, false); - zero_index = ggml_new_tensor_1d(compute_ctx, GGML_TYPE_I32, 1); - set_backend_tensor_data(zero_index, zero_index_vec.data()); - ggml_build_forward_expand(gf, zero_index); - preprocess_lora_tensors(model_tensors); original_tensor_to_final_tensor.clear(); + applied_lora_tensors.clear(); - std::set applied_lora_tensors; for (auto it : model_tensors) { std::string model_tensor_name = it.first; ggml_tensor* model_tensor = it.second; // lora - ggml_tensor* updown = get_lora_diff(model_tensor_name, applied_lora_tensors); - // loha - if (updown == nullptr) { - updown = get_loha_diff(model_tensor_name, applied_lora_tensors); - } - - // lokr - if (updown == nullptr) { - updown = get_lokr_diff(model_tensor_name, applied_lora_tensors); - } - - if (updown == nullptr) { + ggml_tensor* diff = get_weight_diff(model_tensor_name, compute_ctx, model_tensor); + if (diff == nullptr) { continue; } @@ -479,38 +666,49 @@ struct LoraModel : public GGMLRunner { set_backend_tensor_data(model_tensor, original_tensor->data); } - if (ggml_nelements(updown) < ggml_nelements(model_tensor)) { - if (ggml_n_dims(updown) == 2 && ggml_n_dims(model_tensor) == 2 && updown->ne[0] == model_tensor->ne[0]) { - LOG_WARN("pad for %s", model_tensor_name.c_str()); - auto pad_tensor = ggml_ext_zeros(compute_ctx, updown->ne[0], model_tensor->ne[1] - updown->ne[1], 1, 1); - updown = ggml_concat(compute_ctx, updown, pad_tensor, 1); - } - } - - GGML_ASSERT(ggml_nelements(updown) == ggml_nelements(model_tensor)); - updown = ggml_reshape(compute_ctx, updown, model_tensor); ggml_tensor* final_tensor; if (model_tensor->type != GGML_TYPE_F32 && model_tensor->type != GGML_TYPE_F16) { - final_tensor = to_f32(compute_ctx, model_tensor); - final_tensor = ggml_add_inplace(compute_ctx, final_tensor, updown); + final_tensor = ggml_ext_cast_f32(compute_ctx, model_tensor); + final_tensor = ggml_add_inplace(compute_ctx, final_tensor, diff); final_tensor = ggml_cpy(compute_ctx, final_tensor, model_tensor); } else { - final_tensor = ggml_add_inplace(compute_ctx, model_tensor, updown); + final_tensor = ggml_add_inplace(compute_ctx, model_tensor, diff); } ggml_build_forward_expand(gf, final_tensor); if (!ggml_backend_is_cpu(runtime_backend) && ggml_backend_buffer_is_host(original_tensor->buffer)) { original_tensor_to_final_tensor[original_tensor] = final_tensor; } } + return gf; + } + + void apply(std::map model_tensors, SDVersion version, int n_threads) { + auto get_graph = [&]() -> struct ggml_cgraph* { + return build_lora_graph(model_tensors, version); + }; + GGMLRunner::compute(get_graph, n_threads, false); + 
stat(); + for (auto item : original_tensor_to_final_tensor) { + ggml_tensor* original_tensor = item.first; + ggml_tensor* final_tensor = item.second; + + ggml_backend_tensor_copy(final_tensor, original_tensor); + } + original_tensor_to_final_tensor.clear(); + GGMLRunner::free_compute_buffer(); + } + + void stat(bool at_runtime = false) { size_t total_lora_tensors_count = 0; size_t applied_lora_tensors_count = 0; for (auto& kv : lora_tensors) { total_lora_tensors_count++; if (applied_lora_tensors.find(kv.first) == applied_lora_tensors.end()) { - LOG_WARN("unused lora tensor |%s|", kv.first.c_str()); - print_ggml_tensor(kv.second, true); - // exit(0); + if (!at_runtime) { + LOG_WARN("unused lora tensor |%s|", kv.first.c_str()); + print_ggml_tensor(kv.second, true); + } } else { applied_lora_tensors_count++; } @@ -518,30 +716,87 @@ /* Don't worry if this message shows up twice in the logs per LoRA, * this function is called once to calculate the required buffer size * and then again to actually generate a graph to be used */ - if (applied_lora_tensors_count != total_lora_tensors_count) { - LOG_WARN("Only (%lu / %lu) LoRA tensors will be applied", - applied_lora_tensors_count, total_lora_tensors_count); + if (!at_runtime && applied_lora_tensors_count != total_lora_tensors_count) { + LOG_WARN("Only (%lu / %lu) LoRA tensors have been applied, lora_file_path = %s", - applied_lora_tensors_count, total_lora_tensors_count); + applied_lora_tensors_count, total_lora_tensors_count, file_path.c_str()); } else { - LOG_DEBUG("(%lu / %lu) LoRA tensors will be applied", - applied_lora_tensors_count, total_lora_tensors_count); + LOG_INFO("(%lu / %lu) LoRA tensors have been applied, lora_file_path = %s", + applied_lora_tensors_count, total_lora_tensors_count, file_path.c_str()); } + } +}; - return gf; +struct MultiLoraAdapter : public WeightAdapter { +protected: + std::vector<std::shared_ptr<LoraModel>> lora_models; + +public: + explicit MultiLoraAdapter(const std::vector<std::shared_ptr<LoraModel>>& lora_models) + : lora_models(lora_models) { } - void apply(std::map<std::string, struct ggml_tensor*> model_tensors, SDVersion version, int n_threads) { - auto get_graph = [&]() -> struct ggml_cgraph* { - return build_lora_graph(model_tensors, version); - }; - GGMLRunner::compute(get_graph, n_threads, false); - for (auto item : original_tensor_to_final_tensor) { - ggml_tensor* original_tensor = item.first; - ggml_tensor* final_tensor = item.second; + ggml_tensor* patch_weight(ggml_context* ctx, ggml_tensor* weight, const std::string& weight_name, bool with_lora) { + for (auto& lora_model : lora_models) { + ggml_tensor* diff = lora_model->get_weight_diff(weight_name, ctx, weight, with_lora); + if (diff == nullptr) { + continue; + } - ggml_backend_tensor_copy(final_tensor, original_tensor); + if (weight->type != GGML_TYPE_F32 && weight->type != GGML_TYPE_F16) { + weight = ggml_ext_cast_f32(ctx, weight); + } + weight = ggml_add(ctx, weight, diff); } - original_tensor_to_final_tensor.clear(); - GGMLRunner::free_compute_buffer(); + return weight; + } + + ggml_tensor* patch_weight(ggml_context* ctx, ggml_tensor* weight, const std::string& weight_name) override { + return patch_weight(ctx, weight, weight_name, true); + } + + ggml_tensor* forward_with_lora(ggml_context* ctx, + ggml_tensor* x, + ggml_tensor* w, + ggml_tensor* b, + const std::string& prefix, + WeightAdapter::ForwardParams forward_params) override { + w = patch_weight(ctx, w, prefix + "weight", false); + if (b) { + b = patch_weight(ctx, b, prefix + "bias", false); + } + ggml_tensor* out; + if (forward_params.op_type == ForwardParams::op_type_t::OP_LINEAR) {
+ out = ggml_ext_linear(ctx, x, w, b, forward_params.linear.force_prec_f32, forward_params.linear.scale); + } else { // OP_CONV2D + out = ggml_ext_conv_2d(ctx, + x, + w, + b, + forward_params.conv2d.s0, + forward_params.conv2d.s1, + forward_params.conv2d.p0, + forward_params.conv2d.p1, + forward_params.conv2d.d0, + forward_params.conv2d.d1, + forward_params.conv2d.direct, + forward_params.conv2d.scale); + } + for (auto& lora_model : lora_models) { + ggml_tensor* out_diff = lora_model->get_out_diff(ctx, x, forward_params, prefix + "weight"); + if (out_diff == nullptr) { + continue; + } + out = ggml_add_inplace(ctx, out, out_diff); + } + return out; + } + + size_t get_extra_graph_size() override { + size_t lora_tensor_num = 0; + for (auto& lora_model : lora_models) { + lora_tensor_num += lora_model->lora_tensors.size(); + } + return LORA_GRAPH_BASE_SIZE + lora_tensor_num * 10; } }; diff --git a/mmdit.hpp b/mmdit.hpp index 3ca01d952..c243e034a 100644 --- a/mmdit.hpp +++ b/mmdit.hpp @@ -870,7 +870,7 @@ struct MMDiTRunner : public GGMLRunner { struct ggml_tensor* context, struct ggml_tensor* y, std::vector skip_layers = std::vector()) { - struct ggml_cgraph* gf = ggml_new_graph_custom(compute_ctx, MMDIT_GRAPH_SIZE, false); + struct ggml_cgraph* gf = new_graph_custom(MMDIT_GRAPH_SIZE); x = to_backend(x); context = to_backend(context); diff --git a/model.cpp b/model.cpp index 519284e6a..dac6e88f5 100644 --- a/model.cpp +++ b/model.cpp @@ -1254,15 +1254,59 @@ std::map ModelLoader::get_vae_wtype_stat() { return wtype_stat; } -void ModelLoader::set_wtype_override(ggml_type wtype, std::string prefix) { +static std::vector> parse_tensor_type_rules(const std::string& tensor_type_rules) { + std::vector> result; + for (const auto& item : split_string(tensor_type_rules, ',')) { + if (item.size() == 0) + continue; + std::string::size_type pos = item.find('='); + if (pos == std::string::npos) { + LOG_WARN("ignoring invalid quant override \"%s\"", item.c_str()); + continue; + } + std::string tensor_pattern = item.substr(0, pos); + std::string type_name = item.substr(pos + 1); + + ggml_type tensor_type = GGML_TYPE_COUNT; + + if (type_name == "f32") { + tensor_type = GGML_TYPE_F32; + } else { + for (size_t i = 0; i < GGML_TYPE_COUNT; i++) { + auto trait = ggml_get_type_traits((ggml_type)i); + if (trait->to_float && trait->type_size && type_name == trait->type_name) { + tensor_type = (ggml_type)i; + } + } + } + + if (tensor_type != GGML_TYPE_COUNT) { + result.emplace_back(tensor_pattern, tensor_type); + } else { + LOG_WARN("ignoring invalid quant override \"%s\"", item.c_str()); + } + } + return result; +} + +void ModelLoader::set_wtype_override(ggml_type wtype, std::string tensor_type_rules) { + auto map_rules = parse_tensor_type_rules(tensor_type_rules); for (auto& [name, tensor_storage] : tensor_storage_map) { - if (!starts_with(name, prefix)) { + ggml_type dst_type = wtype; + for (const auto& tensor_type_rule : map_rules) { + std::regex pattern(tensor_type_rule.first); + if (std::regex_search(name, pattern)) { + dst_type = tensor_type_rule.second; + break; + } + } + if (dst_type == GGML_TYPE_COUNT) { continue; } - if (!tensor_should_be_converted(tensor_storage, wtype)) { + if (!tensor_should_be_converted(tensor_storage, dst_type)) { continue; } - tensor_storage.expected_type = wtype; + tensor_storage.expected_type = dst_type; } } @@ -1603,41 +1647,6 @@ bool ModelLoader::load_tensors(std::map& tenso return true; } -std::vector> parse_tensor_type_rules(const std::string& tensor_type_rules) { - std::vector> 
result; - for (const auto& item : split_string(tensor_type_rules, ',')) { - if (item.size() == 0) - continue; - std::string::size_type pos = item.find('='); - if (pos == std::string::npos) { - LOG_WARN("ignoring invalid quant override \"%s\"", item.c_str()); - continue; - } - std::string tensor_pattern = item.substr(0, pos); - std::string type_name = item.substr(pos + 1); - - ggml_type tensor_type = GGML_TYPE_COUNT; - - if (type_name == "f32") { - tensor_type = GGML_TYPE_F32; - } else { - for (size_t i = 0; i < GGML_TYPE_COUNT; i++) { - auto trait = ggml_get_type_traits((ggml_type)i); - if (trait->to_float && trait->type_size && type_name == trait->type_name) { - tensor_type = (ggml_type)i; - } - } - } - - if (tensor_type != GGML_TYPE_COUNT) { - result.emplace_back(tensor_pattern, tensor_type); - } else { - LOG_WARN("ignoring invalid quant override \"%s\"", item.c_str()); - } - } - return result; -} - bool ModelLoader::tensor_should_be_converted(const TensorStorage& tensor_storage, ggml_type type) { const std::string& name = tensor_storage.name; if (type != GGML_TYPE_COUNT) { diff --git a/model.h b/model.h index 588f98219..2ac079fb5 100644 --- a/model.h +++ b/model.h @@ -292,7 +292,7 @@ class ModelLoader { std::map get_diffusion_model_wtype_stat(); std::map get_vae_wtype_stat(); String2TensorStorage& get_tensor_storage_map() { return tensor_storage_map; } - void set_wtype_override(ggml_type wtype, std::string prefix = ""); + void set_wtype_override(ggml_type wtype, std::string tensor_type_rules = ""); bool load_tensors(on_new_tensor_cb_t on_new_tensor_cb, int n_threads = 0); bool load_tensors(std::map& tensors, std::set ignore_tensors = {}, diff --git a/name_conversion.cpp b/name_conversion.cpp index ea2702a71..c50baa518 100644 --- a/name_conversion.cpp +++ b/name_conversion.cpp @@ -855,6 +855,49 @@ std::string convert_sep_to_dot(std::string name) { return name; } +std::vector cond_stage_model_prefix_vec = { + "cond_stage_model.1.", + "cond_stage_model.", + "conditioner.embedders.", + "text_encoders.", +}; + +std::vector diffuison_model_prefix_vec = { + "model.diffusion_model.", +}; + +std::vector first_stage_model_prefix_vec = { + "first_stage_model.", + "vae.", +}; + +bool is_cond_stage_model_name(const std::string& name) { + for (const auto& prefix : cond_stage_model_prefix_vec) { + if (starts_with(name, prefix) || starts_with(name, "lora." + prefix)) { + return true; + } + } + return false; +} + +bool is_diffusion_model_name(const std::string& name) { + for (const auto& prefix : diffuison_model_prefix_vec) { + if (starts_with(name, prefix) || starts_with(name, "lora." + prefix)) { + return true; + } + } + return false; +} + +bool is_first_stage_model_name(const std::string& name) { + for (const auto& prefix : first_stage_model_prefix_vec) { + if (starts_with(name, prefix) || starts_with(name, "lora." 
+ prefix)) { + return true; + } + } + return false; +} + std::string convert_tensor_name(std::string name, SDVersion version) { bool is_lora = false; bool is_lycoris_underline = false; @@ -956,9 +999,6 @@ std::string convert_tensor_name(std::string name, SDVersion version) { // diffusion model { - std::vector diffuison_model_prefix_vec = { - "model.diffusion_model.", - }; for (const auto& prefix : diffuison_model_prefix_vec) { if (starts_with(name, prefix)) { name = convert_diffusion_model_name(name.substr(prefix.size()), prefix, version); @@ -970,12 +1010,6 @@ std::string convert_tensor_name(std::string name, SDVersion version) { // cond_stage_model { - std::vector cond_stage_model_prefix_vec = { - "cond_stage_model.1.", - "cond_stage_model.", - "conditioner.embedders.", - "text_encoders.", - }; for (const auto& prefix : cond_stage_model_prefix_vec) { if (starts_with(name, prefix)) { name = convert_cond_stage_model_name(name.substr(prefix.size()), prefix); @@ -987,10 +1021,6 @@ std::string convert_tensor_name(std::string name, SDVersion version) { // first_stage_model { - std::vector first_stage_model_prefix_vec = { - "first_stage_model.", - "vae.", - }; for (const auto& prefix : first_stage_model_prefix_vec) { if (starts_with(name, prefix)) { name = convert_first_stage_model_name(name.substr(prefix.size()), prefix); diff --git a/name_conversion.h b/name_conversion.h index eb3d1a9b9..3fefcf780 100644 --- a/name_conversion.h +++ b/name_conversion.h @@ -5,6 +5,10 @@ #include "model.h" +bool is_cond_stage_model_name(const std::string& name); +bool is_diffusion_model_name(const std::string& name); +bool is_first_stage_model_name(const std::string& name); + std::string convert_tensor_name(std::string name, SDVersion version); #endif // __NAME_CONVERSTION_H__ \ No newline at end of file diff --git a/qwen_image.hpp b/qwen_image.hpp index 87d2fb9b2..94ada47d7 100644 --- a/qwen_image.hpp +++ b/qwen_image.hpp @@ -543,7 +543,7 @@ namespace Qwen { std::vector ref_latents = {}, bool increase_ref_index = false) { GGML_ASSERT(x->ne[3] == 1); - struct ggml_cgraph* gf = ggml_new_graph_custom(compute_ctx, QWEN_IMAGE_GRAPH_SIZE, false); + struct ggml_cgraph* gf = new_graph_custom(QWEN_IMAGE_GRAPH_SIZE); x = to_backend(x); context = to_backend(context); diff --git a/qwenvl.hpp b/qwenvl.hpp index 0a914f6c9..9bc268466 100644 --- a/qwenvl.hpp +++ b/qwenvl.hpp @@ -1049,7 +1049,7 @@ namespace Qwen { } struct ggml_cgraph* build_encode_image_graph(struct ggml_tensor* image) { - struct ggml_cgraph* gf = ggml_new_graph_custom(compute_ctx, QWENVL_GRAPH_SIZE, false); + struct ggml_cgraph* gf = new_graph_custom(QWENVL_GRAPH_SIZE); GGML_ASSERT(image->ne[1] % (params.vision.patch_size * params.vision.spatial_merge_size) == 0); GGML_ASSERT(image->ne[0] % (params.vision.patch_size * params.vision.spatial_merge_size) == 0); diff --git a/rng_mt19937.hpp b/rng_mt19937.hpp new file mode 100644 index 000000000..7e6199886 --- /dev/null +++ b/rng_mt19937.hpp @@ -0,0 +1,147 @@ +#ifndef __RNG_MT19937_HPP__ +#define __RNG_MT19937_HPP__ + +#include +#include + +#include "rng.hpp" + +// RNG imitiating torch cpu randn on CPU. 
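+// The generator below pairs PyTorch's MT19937 engine with a Box-Muller transform
+// (normal_fill_16 / normal_double_value), so for a given seed the sequence should
+// match torch.randn on the CPU.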
+// Port from pytorch, original license: https://github.com/pytorch/pytorch/blob/d01a7b0241ed1c4cded7e7ca097249feb343f072/LICENSE +// Ref: https://github.com/pytorch/pytorch/blob/d01a7b0241ed1c4cded7e7ca097249feb343f072/aten/src/ATen/core/TransformationHelper.h, for uniform_real +// Ref: https://github.com/pytorch/pytorch/blob/d01a7b0241ed1c4cded7e7ca097249feb343f072/aten/src/ATen/native/cpu/DistributionTemplates.h, for normal_kernel/normal_fill/normal_fill_16 +// Ref: https://github.com/pytorch/pytorch/blob/d01a7b0241ed1c4cded7e7ca097249feb343f072/aten/src/ATen/core/MT19937RNGEngine.h, for mt19937_engine +// Ref: https://github.com/pytorch/pytorch/blob/d01a7b0241ed1c4cded7e7ca097249feb343f072/aten/src/ATen/core/DistributionsHelper.h, for uniform_real_distribution/normal_distribution +class MT19937RNG : public RNG { + static const int N = 624; + static const int M = 397; + static const uint32_t MATRIX_A = 0x9908b0dfU; + static const uint32_t UMASK = 0x80000000U; + static const uint32_t LMASK = 0x7fffffffU; + + struct State { + uint64_t seed_; + int left_; + bool seeded_; + uint32_t next_; + std::array state_; + bool has_next_gauss = false; + double next_gauss = 0.0f; + }; + + State s; + + uint32_t mix_bits(uint32_t u, uint32_t v) { return (u & UMASK) | (v & LMASK); } + uint32_t twist(uint32_t u, uint32_t v) { return (mix_bits(u, v) >> 1) ^ ((v & 1) ? MATRIX_A : 0); } + void next_state() { + uint32_t* p = s.state_.data(); + s.left_ = N; + s.next_ = 0; + for (int j = N - M + 1; --j; p++) + p[0] = p[M] ^ twist(p[0], p[1]); + for (int j = M; --j; p++) + p[0] = p[M - N] ^ twist(p[0], p[1]); + p[0] = p[M - N] ^ twist(p[0], s.state_[0]); + } + + uint32_t rand_uint32() { + if (--s.left_ == 0) + next_state(); + uint32_t y = s.state_[s.next_++]; + y ^= (y >> 11); + y ^= (y << 7) & 0x9d2c5680U; + y ^= (y << 15) & 0xefc60000U; + y ^= (y >> 18); + return y; + } + + uint64_t rand_uint64() { + uint64_t high = (uint64_t)rand_uint32(); + uint64_t low = (uint64_t)rand_uint32(); + return (high << 32) | low; + } + + template + T uniform_real(V val, T from, T to) { + constexpr auto MASK = static_cast((static_cast(1) << std::numeric_limits::digits) - 1); + constexpr auto DIVISOR = static_cast(1) / (static_cast(1) << std::numeric_limits::digits); + T x = (val & MASK) * DIVISOR; + return (x * (to - from) + from); + } + + double normal_double_value(double mean, double std) { + if (s.has_next_gauss) { + s.has_next_gauss = false; + return s.next_gauss; + } + double u1 = uniform_real(rand_uint64(), 0., 1.); // double + double u2 = uniform_real(rand_uint64(), 0., 1.); // double + + double r = std::sqrt(-2.0 * std::log1p(-u2)); + double theta = 2.0 * 3.14159265358979323846 * u1; + double value = r * std::cos(theta) * std + mean; + s.next_gauss = r * std::sin(theta) * std + mean; + s.has_next_gauss = true; + return value; + } + + void normal_fill_16(float* data, float mean, float std) { + for (int j = 0; j < 8; ++j) { + float u1 = 1.0f - data[j]; + float u2 = data[j + 8]; + float r = std::sqrt(-2.0f * std::log(u1)); + float theta = 2.0f * 3.14159265358979323846 * u2; + data[j] = r * std::cos(theta) * std + mean; + data[j + 8] = r * std::sin(theta) * std + mean; + } + } + + void randn(float* data, int64_t size, float mean = 0.0f, float std = 1.0f) { + if (size >= 16) { + for (int64_t i = 0; i < size; i++) { + data[i] = uniform_real(rand_uint32(), 0.f, 1.f); + } + for (int64_t i = 0; i < size - 15; i += 16) { + normal_fill_16(data + i, mean, std); + } + if (size % 16 != 0) { + // Recompute the last 16 values. 
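+                // PyTorch's normal_fill works on blocks of 16 floats; when size is not a
+                // multiple of 16, the final (overlapping) window is refilled from fresh
+                // uniforms so the tail stays consistent with torch for any size.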
+ data = data + size - 16; + for (int64_t i = 0; i < 16; i++) { + data[i] = uniform_real(rand_uint32(), 0.f, 1.f); + } + normal_fill_16(data, mean, std); + } + } else { + // Strange handling, hard to understand, but keeping it consistent with PyTorch. + for (int64_t i = 0; i < size; i++) { + data[i] = (float)normal_double_value(mean, std); + } + } + } + +public: + MT19937RNG(uint64_t seed = 0) { manual_seed(seed); } + + void manual_seed(uint64_t seed) override { + s.seed_ = seed; + s.seeded_ = true; + s.state_[0] = (uint32_t)(seed & 0xffffffffU); + for (int j = 1; j < N; j++) { + uint32_t prev = s.state_[j - 1]; + s.state_[j] = 1812433253U * (prev ^ (prev >> 30)) + j; + } + s.left_ = 1; + s.next_ = 0; + s.has_next_gauss = false; + } + + std::vector randn(uint32_t n) override { + std::vector out; + out.resize(n); + randn((float*)out.data(), out.size()); + return out; + } +}; + +#endif // __RNG_MT19937_HPP__ \ No newline at end of file diff --git a/stable-diffusion.cpp b/stable-diffusion.cpp index 4cea83a19..d28aee179 100644 --- a/stable-diffusion.cpp +++ b/stable-diffusion.cpp @@ -2,6 +2,7 @@ #include "model.h" #include "rng.hpp" +#include "rng_mt19937.hpp" #include "rng_philox.hpp" #include "stable-diffusion.h" #include "util.h" @@ -17,6 +18,7 @@ #include "vae.hpp" #include "latent-preview.h" +#include "name_conversion.h" const char* model_version_to_str[] = { "SD 1.x", @@ -97,10 +99,11 @@ class StableDiffusionGGML { bool vae_decode_only = false; bool free_params_immediately = false; - std::shared_ptr rng = std::make_shared(); - int n_threads = -1; - float scale_factor = 0.18215f; - float shift_factor = 0.f; + std::shared_ptr rng = std::make_shared(); + std::shared_ptr sampler_rng = nullptr; + int n_threads = -1; + float scale_factor = 0.18215f; + float shift_factor = 0.f; std::shared_ptr cond_stage_model; std::shared_ptr clip_vision; // for svd or wan2.1 i2v @@ -108,10 +111,14 @@ class StableDiffusionGGML { std::shared_ptr high_noise_diffusion_model; std::shared_ptr first_stage_model; std::shared_ptr tae_first_stage; - std::shared_ptr control_net = nullptr; + std::shared_ptr control_net; std::shared_ptr pmid_model; std::shared_ptr pmid_lora; std::shared_ptr pmid_id_embeds; + std::vector> cond_stage_lora_models; + std::vector> diffusion_lora_models; + std::vector> first_stage_lora_models; + bool apply_lora_immediately = false; std::string taesd_path; bool use_tiny_autoencoder = false; @@ -182,6 +189,16 @@ class StableDiffusionGGML { } } + std::shared_ptr get_rng(rng_type_t rng_type) { + if (rng_type == STD_DEFAULT_RNG) { + return std::make_shared(); + } else if (rng_type == CPU_RNG) { + return std::make_shared(); + } else { // default: CUDA_RNG + return std::make_shared(); + } + } + bool init(const sd_ctx_params_t* sd_ctx_params) { n_threads = sd_ctx_params->n_threads; vae_decode_only = sd_ctx_params->vae_decode_only; @@ -191,10 +208,11 @@ class StableDiffusionGGML { use_tiny_autoencoder = taesd_path.size() > 0; offload_params_to_cpu = sd_ctx_params->offload_params_to_cpu; - if (sd_ctx_params->rng_type == STD_DEFAULT_RNG) { - rng = std::make_shared(); - } else if (sd_ctx_params->rng_type == CUDA_RNG) { - rng = std::make_shared(); + rng = get_rng(sd_ctx_params->rng_type); + if (sd_ctx_params->sampler_rng_type != RNG_TYPE_COUNT) { + sampler_rng = get_rng(sd_ctx_params->sampler_rng_type); + } else { + sampler_rng = rng; } ggml_log_set(ggml_log_callback_default, nullptr); @@ -296,11 +314,12 @@ class StableDiffusionGGML { } LOG_INFO("Version: %s ", model_version_to_str[version]); - ggml_type 
wtype = (int)sd_ctx_params->wtype < std::min(SD_TYPE_COUNT, GGML_TYPE_COUNT) - ? (ggml_type)sd_ctx_params->wtype - : GGML_TYPE_COUNT; - if (wtype != GGML_TYPE_COUNT) { - model_loader.set_wtype_override(wtype); + ggml_type wtype = (int)sd_ctx_params->wtype < std::min(SD_TYPE_COUNT, GGML_TYPE_COUNT) + ? (ggml_type)sd_ctx_params->wtype + : GGML_TYPE_COUNT; + std::string tensor_type_rules = SAFE_STR(sd_ctx_params->tensor_type_rules); + if (wtype != GGML_TYPE_COUNT || tensor_type_rules.size() > 0) { + model_loader.set_wtype_override(wtype, tensor_type_rules); } std::map wtype_stat = model_loader.get_wtype_stat(); @@ -329,6 +348,29 @@ class StableDiffusionGGML { LOG_DEBUG("ggml tensor size = %d bytes", (int)sizeof(ggml_tensor)); + if (sd_ctx_params->lora_apply_mode == LORA_APPLY_AUTO) { + bool have_quantized_weight = false; + if (wtype != GGML_TYPE_COUNT && ggml_is_quantized(wtype)) { + have_quantized_weight = true; + } else { + for (const auto& [type, _] : wtype_stat) { + if (ggml_is_quantized(type)) { + have_quantized_weight = true; + break; + } + } + } + if (have_quantized_weight) { + apply_lora_immediately = false; + } else { + apply_lora_immediately = true; + } + } else if (sd_ctx_params->lora_apply_mode == LORA_APPLY_IMMEDIATELY) { + apply_lora_immediately = true; + } else { + apply_lora_immediately = false; + } + if (sd_version_is_sdxl(version)) { scale_factor = 0.13025f; } else if (sd_version_is_sd3(version)) { @@ -571,8 +613,14 @@ class StableDiffusionGGML { version); } if (strlen(SAFE_STR(sd_ctx_params->photo_maker_path)) > 0) { - pmid_lora = std::make_shared(backend, sd_ctx_params->photo_maker_path, "", version); - if (!pmid_lora->load_from_file(true, n_threads)) { + pmid_lora = std::make_shared("pmid", backend, sd_ctx_params->photo_maker_path, "", version); + auto lora_tensor_filter = [&](const std::string& tensor_name) { + if (starts_with(tensor_name, "lora.model")) { + return true; + } + return false; + }; + if (!pmid_lora->load_from_file(n_threads, lora_tensor_filter)) { LOG_WARN("load photomaker lora tensors from %s failed", sd_ctx_params->photo_maker_path); return false; } @@ -907,8 +955,11 @@ class StableDiffusionGGML { return result < -1; } - void apply_lora(std::string lora_name, float multiplier) { - int64_t t0 = ggml_time_ms(); + std::shared_ptr load_lora_model_from_file(const std::string& lora_id, + float multiplier, + ggml_backend_t backend, + LoraModel::filter_t lora_tensor_filter = nullptr) { + std::string lora_name = lora_id; std::string high_noise_tag = "|high_noise|"; bool is_high_noise = false; if (starts_with(lora_name, high_noise_tag)) { @@ -925,25 +976,19 @@ class StableDiffusionGGML { file_path = ckpt_file_path; } else { LOG_WARN("can not find %s or %s for lora %s", st_file_path.c_str(), ckpt_file_path.c_str(), lora_name.c_str()); - return; + return nullptr; } - LoraModel lora(backend, file_path, is_high_noise ? "model.high_noise_" : "", version); - if (!lora.load_from_file(false, n_threads)) { + auto lora = std::make_shared(lora_id, backend, file_path, is_high_noise ? "model.high_noise_" : "", version); + if (!lora->load_from_file(n_threads, lora_tensor_filter)) { LOG_WARN("load lora tensors from %s failed", file_path.c_str()); - return; + return nullptr; } - lora.multiplier = multiplier; - // TODO: send version? 
- lora.apply(tensors, version, n_threads); - lora.free_params_buffer(); - - int64_t t1 = ggml_time_ms(); - - LOG_INFO("lora '%s' applied, taking %.2fs", lora_name.c_str(), (t1 - t0) * 1.0f / 1000); + lora->multiplier = multiplier; + return lora; } - void apply_loras(const std::unordered_map& lora_state) { + void apply_loras_immediately(const std::unordered_map& lora_state) { std::unordered_map lora_state_diff; for (auto& kv : lora_state) { const std::string& lora_name = kv.first; @@ -964,12 +1009,149 @@ class StableDiffusionGGML { } for (auto& kv : lora_state_diff) { - apply_lora(kv.first, kv.second); + int64_t t0 = ggml_time_ms(); + + auto lora = load_lora_model_from_file(kv.first, kv.second, backend); + lora->apply(tensors, version, n_threads); + lora->free_params_buffer(); + + int64_t t1 = ggml_time_ms(); + + LOG_INFO("lora '%s' applied, taking %.2fs", kv.first.c_str(), (t1 - t0) * 1.0f / 1000); } curr_lora_state = lora_state; } + void apply_loras_at_runtime(const std::unordered_map& lora_state) { + cond_stage_lora_models.clear(); + diffusion_lora_models.clear(); + first_stage_lora_models.clear(); + if (cond_stage_model) { + std::vector> lora_models; + auto lora_state_diff = lora_state; + for (auto& lora_model : cond_stage_lora_models) { + auto iter = lora_state_diff.find(lora_model->lora_id); + + if (iter != lora_state_diff.end()) { + lora_model->multiplier = iter->second; + lora_models.push_back(lora_model); + lora_state_diff.erase(iter); + } + } + cond_stage_lora_models = lora_models; + auto lora_tensor_filter = [&](const std::string& tensor_name) { + if (is_cond_stage_model_name(tensor_name)) { + return true; + } + return false; + }; + for (auto& kv : lora_state_diff) { + const std::string& lora_id = kv.first; + float multiplier = kv.second; + + auto lora = load_lora_model_from_file(lora_id, multiplier, clip_backend, lora_tensor_filter); + if (lora && !lora->lora_tensors.empty()) { + lora->preprocess_lora_tensors(tensors); + cond_stage_lora_models.push_back(lora); + } + } + auto multi_lora_adapter = std::make_shared(cond_stage_lora_models); + cond_stage_model->set_weight_adapter(multi_lora_adapter); + } + if (diffusion_model) { + std::vector> lora_models; + auto lora_state_diff = lora_state; + for (auto& lora_model : diffusion_lora_models) { + auto iter = lora_state_diff.find(lora_model->lora_id); + + if (iter != lora_state_diff.end()) { + lora_model->multiplier = iter->second; + lora_models.push_back(lora_model); + lora_state_diff.erase(iter); + } + } + diffusion_lora_models = lora_models; + auto lora_tensor_filter = [&](const std::string& tensor_name) { + if (is_diffusion_model_name(tensor_name)) { + return true; + } + return false; + }; + for (auto& kv : lora_state_diff) { + const std::string& lora_name = kv.first; + float multiplier = kv.second; + + auto lora = load_lora_model_from_file(lora_name, multiplier, backend, lora_tensor_filter); + if (lora && !lora->lora_tensors.empty()) { + lora->preprocess_lora_tensors(tensors); + diffusion_lora_models.push_back(lora); + } + } + auto multi_lora_adapter = std::make_shared(diffusion_lora_models); + diffusion_model->set_weight_adapter(multi_lora_adapter); + if (high_noise_diffusion_model) { + high_noise_diffusion_model->set_weight_adapter(multi_lora_adapter); + } + } + + if (first_stage_model) { + std::vector> lora_models; + auto lora_state_diff = lora_state; + for (auto& lora_model : first_stage_lora_models) { + auto iter = lora_state_diff.find(lora_model->lora_id); + + if (iter != lora_state_diff.end()) { + lora_model->multiplier = 
iter->second; + lora_models.push_back(lora_model); + lora_state_diff.erase(iter); + } + } + first_stage_lora_models = lora_models; + auto lora_tensor_filter = [&](const std::string& tensor_name) { + if (is_first_stage_model_name(tensor_name)) { + return true; + } + return false; + }; + for (auto& kv : lora_state_diff) { + const std::string& lora_name = kv.first; + float multiplier = kv.second; + + auto lora = load_lora_model_from_file(lora_name, multiplier, vae_backend, lora_tensor_filter); + if (lora && !lora->lora_tensors.empty()) { + lora->preprocess_lora_tensors(tensors); + first_stage_lora_models.push_back(lora); + } + } + auto multi_lora_adapter = std::make_shared(first_stage_lora_models); + first_stage_model->set_weight_adapter(multi_lora_adapter); + } + } + + void lora_stat() { + if (!cond_stage_lora_models.empty()) { + LOG_INFO("cond_stage_lora_models:"); + for (auto& lora_model : cond_stage_lora_models) { + lora_model->stat(); + } + } + + if (!diffusion_lora_models.empty()) { + LOG_INFO("diffusion_lora_models:"); + for (auto& lora_model : diffusion_lora_models) { + lora_model->stat(); + } + } + + if (!first_stage_lora_models.empty()) { + LOG_INFO("first_stage_lora_models:"); + for (auto& lora_model : first_stage_lora_models) { + lora_model->stat(); + } + } + } + std::string apply_loras_from_prompt(const std::string& prompt) { auto result_pair = extract_and_remove_lora(prompt); std::unordered_map lora_f2m = result_pair.first; // lora_name -> multiplier @@ -978,10 +1160,18 @@ class StableDiffusionGGML { LOG_DEBUG("lora %s:%.2f", kv.first.c_str(), kv.second); } int64_t t0 = ggml_time_ms(); - apply_loras(lora_f2m); + if (apply_lora_immediately) { + LOG_INFO("apply lora immediately"); + apply_loras_immediately(lora_f2m); + } else { + LOG_INFO("apply at runtime"); + apply_loras_at_runtime(lora_f2m); + } int64_t t1 = ggml_time_ms(); - LOG_INFO("apply_loras completed, taking %.2fs", (t1 - t0) * 1.0f / 1000); - LOG_DEBUG("prompt after extract and remove lora: \"%s\"", result_pair.second.c_str()); + if (!lora_f2m.empty()) { + LOG_INFO("apply_loras completed, taking %.2fs", (t1 - t0) * 1.0f / 1000); + LOG_DEBUG("prompt after extract and remove lora: \"%s\"", result_pair.second.c_str()); + } return result_pair.second; } @@ -1556,7 +1746,7 @@ class StableDiffusionGGML { return denoised; }; - sample_k_diffusion(method, denoise, work_ctx, x, sigmas, rng, eta); + sample_k_diffusion(method, denoise, work_ctx, x, sigmas, sampler_rng, eta); if (inverse_noise_scaling) { x = denoiser->inverse_noise_scaling(sigmas[sigmas.size() - 1], x); @@ -1955,6 +2145,7 @@ enum sd_type_t str_to_sd_type(const char* str) { const char* rng_type_to_str[] = { "std_default", "cuda", + "cpu", }; const char* sd_rng_type_name(enum rng_type_t rng_type) { @@ -2081,6 +2272,28 @@ enum preview_t str_to_preview(const char* str) { return PREVIEW_COUNT; } +const char* lora_apply_mode_to_str[] = { + "auto", + "immediately", + "at_runtime", +}; + +const char* sd_lora_apply_mode_name(enum lora_apply_mode_t mode) { + if (mode < LORA_APPLY_MODE_COUNT) { + return lora_apply_mode_to_str[mode]; + } + return NONE_STR; +} + +enum lora_apply_mode_t str_to_lora_apply_mode(const char* str) { + for (int i = 0; i < LORA_APPLY_MODE_COUNT; i++) { + if (!strcmp(str, lora_apply_mode_to_str[i])) { + return (enum lora_apply_mode_t)i; + } + } + return LORA_APPLY_MODE_COUNT; +} + void sd_ctx_params_init(sd_ctx_params_t* sd_ctx_params) { *sd_ctx_params = {}; sd_ctx_params->vae_decode_only = true; @@ -2088,7 +2301,9 @@ void 
sd_ctx_params_init(sd_ctx_params_t* sd_ctx_params) { sd_ctx_params->n_threads = get_num_physical_cores(); sd_ctx_params->wtype = SD_TYPE_COUNT; sd_ctx_params->rng_type = CUDA_RNG; + sd_ctx_params->sampler_rng_type = RNG_TYPE_COUNT; sd_ctx_params->prediction = DEFAULT_PRED; + sd_ctx_params->lora_apply_mode = LORA_APPLY_AUTO; sd_ctx_params->offload_params_to_cpu = false; sd_ctx_params->keep_clip_on_cpu = false; sd_ctx_params->keep_control_net_on_cpu = false; @@ -2122,11 +2337,13 @@ char* sd_ctx_params_to_str(const sd_ctx_params_t* sd_ctx_params) { "lora_model_dir: %s\n" "embedding_dir: %s\n" "photo_maker_path: %s\n" + "tensor_type_rules: %s\n" "vae_decode_only: %s\n" "free_params_immediately: %s\n" "n_threads: %d\n" "wtype: %s\n" "rng_type: %s\n" + "sampler_rng_type: %s\n" "prediction: %s\n" "offload_params_to_cpu: %s\n" "keep_clip_on_cpu: %s\n" @@ -2151,11 +2368,13 @@ char* sd_ctx_params_to_str(const sd_ctx_params_t* sd_ctx_params) { SAFE_STR(sd_ctx_params->lora_model_dir), SAFE_STR(sd_ctx_params->embedding_dir), SAFE_STR(sd_ctx_params->photo_maker_path), + SAFE_STR(sd_ctx_params->tensor_type_rules), BOOL_STR(sd_ctx_params->vae_decode_only), BOOL_STR(sd_ctx_params->free_params_immediately), sd_ctx_params->n_threads, sd_type_name(sd_ctx_params->wtype), sd_rng_type_name(sd_ctx_params->rng_type), + sd_rng_type_name(sd_ctx_params->sampler_rng_type), sd_prediction_name(sd_ctx_params->prediction), BOOL_STR(sd_ctx_params->offload_params_to_cpu), BOOL_STR(sd_ctx_params->keep_clip_on_cpu), @@ -2454,18 +2673,24 @@ sd_image_t* generate_image_internal(sd_ctx_t* sd_ctx, LOG_WARN("Turn off PhotoMaker"); sd_ctx->sd->stacked_id = false; } else { - id_cond.c_crossattn = sd_ctx->sd->id_encoder(work_ctx, init_img, id_cond.c_crossattn, id_embeds, class_tokens_mask); - int64_t t1 = ggml_time_ms(); - LOG_INFO("Photomaker ID Stacking, taking %" PRId64 " ms", t1 - t0); - if (sd_ctx->sd->free_params_immediately) { - sd_ctx->sd->pmid_model->free_params_buffer(); - } - // Encode input prompt without the trigger word for delayed conditioning - prompt_text_only = sd_ctx->sd->cond_stage_model->remove_trigger_from_prompt(work_ctx, prompt); - // printf("%s || %s \n", prompt.c_str(), prompt_text_only.c_str()); - prompt = prompt_text_only; // - if (sample_steps < 50) { - LOG_WARN("It's recommended to use >= 50 steps for photo maker!"); + if (pm_params.id_images_count != id_embeds->ne[1]) { + LOG_WARN("PhotoMaker image count (%d) does NOT match ID embeds (%d). 
You should run face_detect.py again.", pm_params.id_images_count, id_embeds->ne[1]); + LOG_WARN("Turn off PhotoMaker"); + sd_ctx->sd->stacked_id = false; + } else { + id_cond.c_crossattn = sd_ctx->sd->id_encoder(work_ctx, init_img, id_cond.c_crossattn, id_embeds, class_tokens_mask); + int64_t t1 = ggml_time_ms(); + LOG_INFO("Photomaker ID Stacking, taking %" PRId64 " ms", t1 - t0); + if (sd_ctx->sd->free_params_immediately) { + sd_ctx->sd->pmid_model->free_params_buffer(); + } + // Encode input prompt without the trigger word for delayed conditioning + prompt_text_only = sd_ctx->sd->cond_stage_model->remove_trigger_from_prompt(work_ctx, prompt); + // printf("%s || %s \n", prompt.c_str(), prompt_text_only.c_str()); + prompt = prompt_text_only; // + if (sample_steps < 50) { + LOG_WARN("It's recommended to use >= 50 steps for photo maker!"); + } } } } else { @@ -2611,6 +2836,7 @@ sd_image_t* generate_image_internal(sd_ctx_t* sd_ctx, LOG_INFO("generating image: %i/%i - seed %" PRId64, b + 1, batch_count, cur_seed); sd_ctx->sd->rng->manual_seed(cur_seed); + sd_ctx->sd->sampler_rng->manual_seed(cur_seed); struct ggml_tensor* x_t = init_latent; struct ggml_tensor* noise = ggml_new_tensor_4d(work_ctx, GGML_TYPE_F32, W, H, C, 1); ggml_ext_im_set_randn_f32(noise, sd_ctx->sd->rng); @@ -2674,6 +2900,9 @@ sd_image_t* generate_image_internal(sd_ctx_t* sd_ctx, if (sd_ctx->sd->free_params_immediately && !sd_ctx->sd->use_tiny_autoencoder) { sd_ctx->sd->first_stage_model->free_params_buffer(); } + + sd_ctx->sd->lora_stat(); + sd_image_t* result_images = (sd_image_t*)calloc(batch_count, sizeof(sd_image_t)); if (result_images == nullptr) { ggml_free(work_ctx); @@ -2734,6 +2963,7 @@ sd_image_t* generate_image(sd_ctx_t* sd_ctx, const sd_img_gen_params_t* sd_img_g seed = rand(); } sd_ctx->sd->rng->manual_seed(seed); + sd_ctx->sd->sampler_rng->manual_seed(seed); int sample_steps = sd_img_gen_params->sample_params.sample_steps; @@ -3025,6 +3255,7 @@ SD_API sd_image_t* generate_video(sd_ctx_t* sd_ctx, const sd_vid_gen_params_t* s } sd_ctx->sd->rng->manual_seed(seed); + sd_ctx->sd->sampler_rng->manual_seed(seed); int64_t t0 = ggml_time_ms(); @@ -3343,6 +3574,8 @@ SD_API sd_image_t* generate_video(sd_ctx_t* sd_ctx, const sd_vid_gen_params_t* s sd_ctx->sd->first_stage_model->free_params_buffer(); } + sd_ctx->sd->lora_stat(); + sd_image_t* result_images = (sd_image_t*)calloc(vid->ne[2], sizeof(sd_image_t)); if (result_images == nullptr) { ggml_free(work_ctx); diff --git a/stable-diffusion.h b/stable-diffusion.h index 9e99d53de..6be85af22 100644 --- a/stable-diffusion.h +++ b/stable-diffusion.h @@ -31,6 +31,7 @@ extern "C" { enum rng_type_t { STD_DEFAULT_RNG, CUDA_RNG, + CPU_RNG, RNG_TYPE_COUNT }; @@ -134,6 +135,13 @@ enum preview_t { PREVIEW_COUNT }; +enum lora_apply_mode_t { + LORA_APPLY_AUTO, + LORA_APPLY_IMMEDIATELY, + LORA_APPLY_AT_RUNTIME, + LORA_APPLY_MODE_COUNT, +}; + typedef struct { bool enabled; int tile_size_x; @@ -159,12 +167,15 @@ typedef struct { const char* lora_model_dir; const char* embedding_dir; const char* photo_maker_path; + const char* tensor_type_rules; bool vae_decode_only; bool free_params_immediately; int n_threads; enum sd_type_t wtype; enum rng_type_t rng_type; + enum rng_type_t sampler_rng_type; enum prediction_t prediction; + enum lora_apply_mode_t lora_apply_mode; bool offload_params_to_cpu; bool keep_clip_on_cpu; bool keep_control_net_on_cpu; @@ -267,7 +278,7 @@ typedef void (*sd_preview_cb_t)(int step, int frame_count, sd_image_t* frames, b SD_API void sd_set_log_callback(sd_log_cb_t 
sd_log_cb, void* data); SD_API void sd_set_progress_callback(sd_progress_cb_t cb, void* data); -SD_API void sd_set_preview_callback(sd_preview_cb_t cb, preview_t mode, int interval, bool denoised, bool noisy); +SD_API void sd_set_preview_callback(sd_preview_cb_t cb, enum preview_t mode, int interval, bool denoised, bool noisy); SD_API int32_t get_num_physical_cores(); SD_API const char* sd_get_system_info(); @@ -283,6 +294,8 @@ SD_API const char* sd_prediction_name(enum prediction_t prediction); SD_API enum prediction_t str_to_prediction(const char* str); SD_API const char* sd_preview_name(enum preview_t preview); SD_API enum preview_t str_to_preview(const char* str); +SD_API const char* sd_lora_apply_mode_name(enum lora_apply_mode_t mode); +SD_API enum lora_apply_mode_t str_to_lora_apply_mode(const char* str); SD_API void sd_ctx_params_init(sd_ctx_params_t* sd_ctx_params); SD_API char* sd_ctx_params_to_str(const sd_ctx_params_t* sd_ctx_params); diff --git a/unet.hpp b/unet.hpp index 8f0adf389..de05f464d 100644 --- a/unet.hpp +++ b/unet.hpp @@ -7,7 +7,7 @@ /*==================================================== UnetModel =====================================================*/ -#define UNET_GRAPH_SIZE 10240 +#define UNET_GRAPH_SIZE 102400 class SpatialVideoTransformer : public SpatialTransformer { protected: @@ -612,7 +612,7 @@ struct UNetModelRunner : public GGMLRunner { int num_video_frames = -1, std::vector controls = {}, float control_strength = 0.f) { - struct ggml_cgraph* gf = ggml_new_graph_custom(compute_ctx, UNET_GRAPH_SIZE, false); + struct ggml_cgraph* gf = new_graph_custom(UNET_GRAPH_SIZE); if (num_video_frames == -1) { num_video_frames = x->ne[3]; diff --git a/util.cpp b/util.cpp index 1aa9beff8..c46216646 100644 --- a/util.cpp +++ b/util.cpp @@ -5,6 +5,7 @@ #include #include #include +#include #include #include #include @@ -547,6 +548,8 @@ sd_image_f32_t clip_preprocess(sd_image_f32_t image, int target_width, int targe // (abc) - increases attention to abc by a multiplier of 1.1 // (abc:3.12) - increases attention to abc by a multiplier of 3.12 // [abc] - decreases attention to abc by a multiplier of 1.1 +// BREAK - separates the prompt into conceptually distinct parts for sequential processing +// B - internal helper pattern; prevents 'B' in 'BREAK' from being consumed as normal text // \( - literal character '(' // \[ - literal character '[' // \) - literal character ')' @@ -582,7 +585,7 @@ std::vector> parse_prompt_attention(const std::str float round_bracket_multiplier = 1.1f; float square_bracket_multiplier = 1 / 1.1f; - std::regex re_attention(R"(\\\(|\\\)|\\\[|\\\]|\\\\|\\|\(|\[|:([+-]?[.\d]+)\)|\)|\]|[^\\()\[\]:]+|:)"); + std::regex re_attention(R"(\\\(|\\\)|\\\[|\\\]|\\\\|\\|\(|\[|:([+-]?[.\d]+)\)|\)|\]|\bBREAK\b|[^\\()\[\]:B]+|:|\bB)"); std::regex re_break(R"(\s*\bBREAK\b\s*)"); auto multiply_range = [&](int start_position, float multiplier) { @@ -591,7 +594,7 @@ std::vector> parse_prompt_attention(const std::str } }; - std::smatch m; + std::smatch m, m2; std::string remaining_text = text; while (std::regex_search(remaining_text, m, re_attention)) { @@ -615,6 +618,8 @@ std::vector> parse_prompt_attention(const std::str square_brackets.pop_back(); } else if (text == "\\(") { res.push_back({text.substr(1), 1.0f}); + } else if (std::regex_search(text, m2, re_break)) { + res.push_back({"BREAK", -1.0f}); } else { res.push_back({text, 1.0f}); } @@ -645,4 +650,4 @@ std::vector> parse_prompt_attention(const std::str } return res; -} \ No newline at end of file +} diff --git 
a/wan.hpp b/wan.hpp index 91a2e9205..41882e790 100644 --- a/wan.hpp +++ b/wan.hpp @@ -1133,7 +1133,7 @@ namespace WAN { } struct ggml_cgraph* build_graph(struct ggml_tensor* z, bool decode_graph) { - struct ggml_cgraph* gf = ggml_new_graph_custom(compute_ctx, 10240 * z->ne[2], false); + struct ggml_cgraph* gf = new_graph_custom(10240 * z->ne[2]); z = to_backend(z); @@ -1147,7 +1147,7 @@ namespace WAN { } struct ggml_cgraph* build_graph_partial(struct ggml_tensor* z, bool decode_graph, int64_t i) { - struct ggml_cgraph* gf = ggml_new_graph_custom(compute_ctx, 20480, false); + struct ggml_cgraph* gf = new_graph_custom(20480); ae.clear_cache(); @@ -2142,7 +2142,7 @@ namespace WAN { struct ggml_tensor* time_dim_concat = nullptr, struct ggml_tensor* vace_context = nullptr, float vace_strength = 1.f) { - struct ggml_cgraph* gf = ggml_new_graph_custom(compute_ctx, WAN_GRAPH_SIZE, false); + struct ggml_cgraph* gf = new_graph_custom(WAN_GRAPH_SIZE); x = to_backend(x); timesteps = to_backend(timesteps);