diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml index a4d6ca612..8dd7ce19c 100644 --- a/.github/workflows/build.yml +++ b/.github/workflows/build.yml @@ -65,7 +65,7 @@ jobs: - name: Get commit hash id: commit - if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/main' ) || github.event.inputs.create_release == 'true' }} + if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }} uses: pr-mpt/actions-commit-hash@v2 - name: Fetch system info @@ -118,7 +118,7 @@ jobs: - name: Get commit hash id: commit - if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/main' ) || github.event.inputs.create_release == 'true' }} + if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }} uses: pr-mpt/actions-commit-hash@v2 - name: Fetch system info @@ -164,8 +164,6 @@ jobs: defines: "-DGGML_NATIVE=OFF -DGGML_AVX512=ON -DGGML_AVX=ON -DGGML_AVX2=ON -DSD_BUILD_SHARED_LIBS=ON" - build: "cuda12" defines: "-DSD_CUDA=ON -DSD_BUILD_SHARED_LIBS=ON -DCMAKE_CUDA_ARCHITECTURES=90;89;86;80;75" - # - build: "rocm5.5" - # defines: '-G Ninja -DCMAKE_C_COMPILER=clang -DCMAKE_CXX_COMPILER=clang++ -DSD_HIPBLAS=ON -DCMAKE_BUILD_TYPE=Release -DAMDGPU_TARGETS="gfx1100;gfx1102;gfx1030" -DSD_BUILD_SHARED_LIBS=ON' - build: 'vulkan' defines: "-DSD_VULKAN=ON -DSD_BUILD_SHARED_LIBS=ON" steps: @@ -184,22 +182,9 @@ jobs: method: "network" sub-packages: '["nvcc", "cudart", "cublas", "cublas_dev", "thrust", "visual_studio_integration"]' - - name: Install rocm-toolkit - id: rocm-toolkit - if: ${{ matrix.build == 'rocm5.5' }} - uses: Cyberhan123/rocm-toolkit@v0.1.0 - with: - rocm: "5.5.0" - - - name: Install Ninja - id: install-ninja - if: ${{ matrix.build == 'rocm5.5' }} - uses: urkle/action-get-ninja@v1 - with: - version: 1.11.1 - name: Install Vulkan SDK id: get_vulkan - if: ${{ matrix.build == 'vulkan' }} https://sdk.lunarg.com/sdk/download/1.4.328.1/windows/vulkansdk-windows-X64-1.4.328.1.exe + if: ${{ matrix.build == 'vulkan' }} run: | curl.exe -o $env:RUNNER_TEMP/VulkanSDK-Installer.exe -L "https://sdk.lunarg.com/sdk/download/$%7Benv:VULKAN_VERSION%7D/windows/vulkansdk-windows-X64-$%7Benv:VULKAN_VERSION%7D.exe" & "$env:RUNNER_TEMP\VulkanSDK-Installer.exe" --accept-licenses --default-answer --confirm-command install @@ -277,6 +262,104 @@ jobs: path: | sd-${{ env.BRANCH_NAME }}-${{ steps.commit.outputs.short }}-bin-win-${{ matrix.build }}-x64.zip + windows-latest-cmake-hip: + runs-on: windows-2022 + + env: + HIPSDK_INSTALLER_VERSION: "25.Q3" + GPU_TARGETS: "gfx1151;gfx1200;gfx1201;gfx1100;gfx1101;gfx1102;gfx1030;gfx1031;gfx1032" + + steps: + - uses: actions/checkout@v3 + with: + submodules: recursive + + - name: Cache ROCm Installation + id: cache-rocm + uses: actions/cache@v4 + with: + path: C:\Program Files\AMD\ROCm + key: rocm-${{ env.HIPSDK_INSTALLER_VERSION }}-${{ runner.os }} + + - name: ccache + uses: ggml-org/ccache-action@v1.2.16 + with: + key: windows-latest-cmake-hip-${{ env.HIPSDK_INSTALLER_VERSION }}-x64 + evict-old-files: 1d + + - name: Install ROCm + if: steps.cache-rocm.outputs.cache-hit != 'true' + run: | + $ErrorActionPreference = "Stop" + write-host "Downloading AMD HIP SDK Installer" + Invoke-WebRequest -Uri "https://download.amd.com/developer/eula/rocm-hub/AMD-Software-PRO-Edition-$%7B%7B%20env.HIPSDK_INSTALLER_VERSION%20%7D%7D-WinSvr2022-For-HIP.exe" -OutFile "${env:RUNNER_TEMP}\rocm-install.exe" + 
write-host "Installing AMD HIP SDK" + $proc = Start-Process "${env:RUNNER_TEMP}\rocm-install.exe" -ArgumentList '-install' -NoNewWindow -PassThru + $completed = $proc.WaitForExit(600000) + if (-not $completed) { + Write-Error "ROCm installation timed out after 10 minutes. Killing the process" + $proc.Kill() + exit 1 + } + if ($proc.ExitCode -ne 0) { + Write-Error "ROCm installation failed with exit code $($proc.ExitCode)" + exit 1 + } + write-host "Completed AMD HIP SDK installation" + + - name: Verify ROCm + run: | + # Find and test ROCm installation + $clangPath = Get-ChildItem 'C:\Program Files\AMD\ROCm\*\bin\clang.exe' | Select-Object -First 1 + if (-not $clangPath) { + Write-Error "ROCm installation not found" + exit 1 + } + & $clangPath.FullName --version + # Set HIP_PATH environment variable for later steps + echo "HIP_PATH=$(Resolve-Path 'C:\Program Files\AMD\ROCm\*\bin\clang.exe' | split-path | split-path)" >> $env:GITHUB_ENV + + - name: Build + run: | + mkdir build + cd build + $env:CMAKE_PREFIX_PATH="${env:HIP_PATH}" + cmake .. ` + -G "Unix Makefiles" ` + -DSD_HIPBLAS=ON ` + -DSD_BUILD_SHARED_LIBS=ON ` + -DGGML_NATIVE=OFF ` + -DCMAKE_C_COMPILER=clang ` + -DCMAKE_CXX_COMPILER=clang++ ` + -DCMAKE_BUILD_TYPE=Release ` + -DGPU_TARGETS="${{ env.GPU_TARGETS }}" + cmake --build . --config Release --parallel ${env:NUMBER_OF_PROCESSORS} + + - name: Get commit hash + id: commit + if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }} + uses: pr-mpt/actions-commit-hash@v2 + + - name: Pack artifacts + if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }} + run: | + md "build\bin\rocblas\library\" + md "build\bin\hipblaslt\library" + cp "${env:HIP_PATH}\bin\hipblas.dll" "build\bin\" + cp "${env:HIP_PATH}\bin\hipblaslt.dll" "build\bin\" + cp "${env:HIP_PATH}\bin\rocblas.dll" "build\bin\" + cp "${env:HIP_PATH}\bin\rocblas\library\*" "build\bin\rocblas\library\" + cp "${env:HIP_PATH}\bin\hipblaslt\library\*" "build\bin\hipblaslt\library\" + 7z a sd-${{ env.BRANCH_NAME }}-${{ steps.commit.outputs.short }}-bin-win-rocm-x64.zip .\build\bin\* + + - name: Upload artifacts + if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }} + uses: actions/upload-artifact@v4 + with: + name: sd-${{ env.BRANCH_NAME }}-${{ steps.commit.outputs.short }}-bin-win-rocm-x64.zip + path: | + sd-${{ env.BRANCH_NAME }}-${{ steps.commit.outputs.short }}-bin-win-rocm-x64.zip + release: if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }} @@ -286,6 +369,7 @@ jobs: - ubuntu-latest-cmake - macOS-latest-cmake - windows-latest-cmake + - windows-latest-cmake-hip steps: - name: Clone diff --git a/README.md b/README.md index 5cc6e4458..058cd5818 100644 --- a/README.md +++ b/README.md @@ -81,7 +81,9 @@ API and command-line option may change frequently.*** - [`DPM++ 2M v2`](https://github.com/AUTOMATIC1111/stable-diffusion-webui/discussions/8457) - `DPM++ 2S a` - [`LCM`](https://github.com/AUTOMATIC1111/stable-diffusion-webui/issues/13952) -- Cross-platform reproducibility (`--rng cuda`, consistent with the `stable-diffusion-webui GPU RNG`) +- Cross-platform reproducibility + - `--rng cuda`, default, consistent with the `stable-diffusion-webui GPU RNG` + - `--rng cpu`, consistent with the `comfyui RNG` - Embedds generation parameters into png 
output as webui-compatible text string ## Quick Start diff --git a/clip.hpp b/clip.hpp index eb37638cc..e2a892ca0 100644 --- a/clip.hpp +++ b/clip.hpp @@ -936,7 +936,7 @@ struct CLIPTextModelRunner : public GGMLRunner { size_t max_token_idx = 0, bool return_pooled = false, int clip_skip = -1) { - struct ggml_cgraph* gf = ggml_new_graph(compute_ctx); + struct ggml_cgraph* gf = new_graph_custom(2048); input_ids = to_backend(input_ids); diff --git a/common.hpp b/common.hpp index c68ddafe5..dd8281f9e 100644 --- a/common.hpp +++ b/common.hpp @@ -182,31 +182,21 @@ class GEGLU : public UnaryBlock { int64_t dim_in; int64_t dim_out; - void init_params(struct ggml_context* ctx, const String2TensorStorage& tensor_storage_map = {}, std::string prefix = "") override { - enum ggml_type wtype = get_type(prefix + "proj.weight", tensor_storage_map, GGML_TYPE_F32); - enum ggml_type bias_wtype = GGML_TYPE_F32; - params["proj.weight"] = ggml_new_tensor_2d(ctx, wtype, dim_in, dim_out * 2); - params["proj.bias"] = ggml_new_tensor_1d(ctx, bias_wtype, dim_out * 2); - } - public: GEGLU(int64_t dim_in, int64_t dim_out) - : dim_in(dim_in), dim_out(dim_out) {} + : dim_in(dim_in), dim_out(dim_out) { + blocks["proj"] = std::shared_ptr(new Linear(dim_in, dim_out * 2)); + } struct ggml_tensor* forward(GGMLRunnerContext* ctx, struct ggml_tensor* x) override { // x: [ne3, ne2, ne1, dim_in] // return: [ne3, ne2, ne1, dim_out] - struct ggml_tensor* w = params["proj.weight"]; - struct ggml_tensor* b = params["proj.bias"]; - - auto x_w = ggml_view_2d(ctx->ggml_ctx, w, w->ne[0], w->ne[1] / 2, w->nb[1], 0); // [dim_out, dim_in] - auto x_b = ggml_view_1d(ctx->ggml_ctx, b, b->ne[0] / 2, 0); // [dim_out, dim_in] - auto gate_w = ggml_view_2d(ctx->ggml_ctx, w, w->ne[0], w->ne[1] / 2, w->nb[1], w->nb[1] * w->ne[1] / 2); // [dim_out, ] - auto gate_b = ggml_view_1d(ctx->ggml_ctx, b, b->ne[0] / 2, b->nb[0] * b->ne[0] / 2); // [dim_out, ] + auto proj = std::dynamic_pointer_cast(blocks["proj"]); - auto x_in = x; - x = ggml_ext_linear(ctx->ggml_ctx, x_in, x_w, x_b); // [ne3, ne2, ne1, dim_out] - auto gate = ggml_ext_linear(ctx->ggml_ctx, x_in, gate_w, gate_b); // [ne3, ne2, ne1, dim_out] + x = proj->forward(ctx, x); // [ne3, ne2, ne1, dim_out*2] + auto x_vec = ggml_ext_chunk(ctx->ggml_ctx, x, 2, 0); + x = x_vec[0]; // [ne3, ne2, ne1, dim_out] + auto gate = x_vec[1]; // [ne3, ne2, ne1, dim_out] gate = ggml_gelu_inplace(ctx->ggml_ctx, gate); diff --git a/conditioner.hpp b/conditioner.hpp index 93e0c2818..94e98a511 100644 --- a/conditioner.hpp +++ b/conditioner.hpp @@ -34,6 +34,7 @@ struct Conditioner { virtual void free_params_buffer() = 0; virtual void get_param_tensors(std::map& tensors) = 0; virtual size_t get_params_buffer_size() = 0; + virtual void set_weight_adapter(const std::shared_ptr& adapter) {} virtual std::tuple> get_learned_condition_with_trigger(ggml_context* work_ctx, int n_threads, const ConditionerParams& conditioner_params) { @@ -108,6 +109,13 @@ struct FrozenCLIPEmbedderWithCustomWords : public Conditioner { return buffer_size; } + void set_weight_adapter(const std::shared_ptr& adapter) override { + text_model->set_weight_adapter(adapter); + if (sd_version_is_sdxl(version)) { + text_model2->set_weight_adapter(adapter); + } + } + bool load_embedding(std::string embd_name, std::string embd_path, std::vector& bpe_tokens) { // the order matters ModelLoader model_loader; @@ -270,13 +278,30 @@ struct FrozenCLIPEmbedderWithCustomWords : public Conditioner { const std::string& curr_text = item.first; float curr_weight = 
item.second; // printf(" %s: %f \n", curr_text.c_str(), curr_weight); + int32_t clean_index = 0; + if (curr_text == "BREAK" && curr_weight == -1.0f) { + // Pad token array up to chunk size at this point. + // TODO: This is a hardcoded chunk_len, like in stable-diffusion.cpp, make it a parameter for the future? + // Also, this is 75 instead of 77 to leave room for BOS and EOS tokens. + int padding_size = 75 - (tokens_acc % 75); + for (int j = 0; j < padding_size; j++) { + clean_input_ids.push_back(tokenizer.EOS_TOKEN_ID); + clean_index++; + } + + // After padding, continue to the next iteration to process the following text as a new segment + tokens.insert(tokens.end(), clean_input_ids.begin(), clean_input_ids.end()); + weights.insert(weights.end(), padding_size, curr_weight); + continue; + } + + // Regular token, process normally std::vector curr_tokens = tokenizer.encode(curr_text, on_new_token_cb); - int32_t clean_index = 0; for (uint32_t i = 0; i < curr_tokens.size(); i++) { int token_id = curr_tokens[i]; - if (token_id == image_token) + if (token_id == image_token) { class_token_index.push_back(clean_index - 1); - else { + } else { clean_input_ids.push_back(token_id); clean_index++; } @@ -379,6 +404,22 @@ struct FrozenCLIPEmbedderWithCustomWords : public Conditioner { for (const auto& item : parsed_attention) { const std::string& curr_text = item.first; float curr_weight = item.second; + + if (curr_text == "BREAK" && curr_weight == -1.0f) { + // Pad token array up to chunk size at this point. + // TODO: This is a hardcoded chunk_len, like in stable-diffusion.cpp, make it a parameter for the future? + // Also, this is 75 instead of 77 to leave room for BOS and EOS tokens. + size_t current_size = tokens.size(); + size_t padding_size = (75 - (current_size % 75)) % 75; // Ensure no negative padding + + if (padding_size > 0) { + LOG_DEBUG("BREAK token encountered, padding current chunk by %zu tokens.", padding_size); + tokens.insert(tokens.end(), padding_size, tokenizer.EOS_TOKEN_ID); + weights.insert(weights.end(), padding_size, 1.0f); + } + continue; // Skip to the next item after handling BREAK + } + std::vector curr_tokens = tokenizer.encode(curr_text, on_new_token_cb); tokens.insert(tokens.end(), curr_tokens.begin(), curr_tokens.end()); weights.insert(weights.end(), curr_tokens.size(), curr_weight); @@ -764,6 +805,18 @@ struct SD3CLIPEmbedder : public Conditioner { return buffer_size; } + void set_weight_adapter(const std::shared_ptr& adapter) override { + if (clip_l) { + clip_l->set_weight_adapter(adapter); + } + if (clip_g) { + clip_g->set_weight_adapter(adapter); + } + if (t5) { + t5->set_weight_adapter(adapter); + } + } + std::vector, std::vector>> tokenize(std::string text, size_t max_length = 0, bool padding = false) { @@ -1160,6 +1213,15 @@ struct FluxCLIPEmbedder : public Conditioner { return buffer_size; } + void set_weight_adapter(const std::shared_ptr& adapter) { + if (clip_l) { + clip_l->set_weight_adapter(adapter); + } + if (t5) { + t5->set_weight_adapter(adapter); + } + } + std::vector, std::vector>> tokenize(std::string text, size_t max_length = 0, bool padding = false) { @@ -1400,6 +1462,12 @@ struct T5CLIPEmbedder : public Conditioner { return buffer_size; } + void set_weight_adapter(const std::shared_ptr& adapter) override { + if (t5) { + t5->set_weight_adapter(adapter); + } + } + std::tuple, std::vector, std::vector> tokenize(std::string text, size_t max_length = 0, bool padding = false) { @@ -1589,6 +1657,12 @@ struct Qwen2_5_VLCLIPEmbedder : public Conditioner { 
return buffer_size; } + void set_weight_adapter(const std::shared_ptr& adapter) override { + if (qwenvl) { + qwenvl->set_weight_adapter(adapter); + } + } + std::tuple, std::vector> tokenize(std::string text, size_t max_length = 0, size_t system_prompt_length = 0, diff --git a/control.hpp b/control.hpp index b34140efb..d86f64cbf 100644 --- a/control.hpp +++ b/control.hpp @@ -380,7 +380,7 @@ struct ControlNet : public GGMLRunner { struct ggml_tensor* timesteps, struct ggml_tensor* context, struct ggml_tensor* y = nullptr) { - struct ggml_cgraph* gf = ggml_new_graph_custom(compute_ctx, CONTROL_NET_GRAPH_SIZE, false); + struct ggml_cgraph* gf = new_graph_custom(CONTROL_NET_GRAPH_SIZE); x = to_backend(x); if (guided_hint_cached) { diff --git a/diffusion_model.hpp b/diffusion_model.hpp index 307049814..0a3914edc 100644 --- a/diffusion_model.hpp +++ b/diffusion_model.hpp @@ -35,8 +35,9 @@ struct DiffusionModel { virtual void free_compute_buffer() = 0; virtual void get_param_tensors(std::map& tensors) = 0; virtual size_t get_params_buffer_size() = 0; - virtual int64_t get_adm_in_channels() = 0; - virtual void set_flash_attn_enabled(bool enabled) = 0; + virtual void set_weight_adapter(const std::shared_ptr& adapter){}; + virtual int64_t get_adm_in_channels() = 0; + virtual void set_flash_attn_enabled(bool enabled) = 0; }; struct UNetModel : public DiffusionModel { @@ -73,6 +74,10 @@ struct UNetModel : public DiffusionModel { return unet.get_params_buffer_size(); } + void set_weight_adapter(const std::shared_ptr& adapter) override { + unet.set_weight_adapter(adapter); + } + int64_t get_adm_in_channels() override { return unet.unet.adm_in_channels; } @@ -130,6 +135,10 @@ struct MMDiTModel : public DiffusionModel { return mmdit.get_params_buffer_size(); } + void set_weight_adapter(const std::shared_ptr& adapter) override { + mmdit.set_weight_adapter(adapter); + } + int64_t get_adm_in_channels() override { return 768 + 1280; } @@ -188,6 +197,10 @@ struct FluxModel : public DiffusionModel { return flux.get_params_buffer_size(); } + void set_weight_adapter(const std::shared_ptr& adapter) override { + flux.set_weight_adapter(adapter); + } + int64_t get_adm_in_channels() override { return 768; } @@ -251,6 +264,10 @@ struct WanModel : public DiffusionModel { return wan.get_params_buffer_size(); } + void set_weight_adapter(const std::shared_ptr& adapter) override { + wan.set_weight_adapter(adapter); + } + int64_t get_adm_in_channels() override { return 768; } @@ -313,6 +330,10 @@ struct QwenImageModel : public DiffusionModel { return qwen_image.get_params_buffer_size(); } + void set_weight_adapter(const std::shared_ptr& adapter) override { + qwen_image.set_weight_adapter(adapter); + } + int64_t get_adm_in_channels() override { return 768; } diff --git a/docs/lora.md b/docs/lora.md index 9885ae549..fe4fbc0b3 100644 --- a/docs/lora.md +++ b/docs/lora.md @@ -12,38 +12,15 @@ Here's a simple example: `../models/marblesh.safetensors` or `../models/marblesh.ckpt` will be applied to the model -# Support matrix - -> ℹ️ CUDA `get_rows` support is defined here: -> [ggml-org/ggml/src/ggml-cuda/getrows.cu#L156](https://github.com/ggml-org/ggml/blob/7dee1d6a1e7611f238d09be96738388da97c88ed/src/ggml-cuda/getrows.cu#L156) -> Currently only the basic types + Q4/Q5/Q8 are implemented. K-quants are **not** supported. - -NOTE: The other backends may have different support. 
- -

| Quant / Type | CUDA | Vulkan |
|--------------|------|--------| | F32 | ✔️ | ✔️ | | F16 | ✔️ | ✔️ | | BF16 | ✔️ | ✔️ | | I32 | ✔️ | ❌ | | Q4_0 | ✔️ | ✔️ | | Q4_1 | ✔️ | ✔️ | | Q5_0 | ✔️ | ✔️ | | Q5_1 | ✔️ | ✔️ | | Q8_0 | ✔️ | ✔️ | | Q2_K | ❌ | ❌ | | Q3_K | ❌ | ❌ | | Q4_K | ❌ | ❌ | | Q5_K | ❌ | ❌ | | Q6_K | ❌ | ❌ | | Q8_K | ❌ | ❌ | | IQ1_S | ❌ | ✔️ | | IQ1_M | ❌ | ✔️ | | IQ2_XXS | ❌ | ✔️ | | IQ2_XS | ❌ | ✔️ | | IQ2_S | ❌ | ✔️ | | IQ3_XXS | ❌ | ✔️ | | IQ3_S | ❌ | ✔️ | | IQ4_XS | ❌ | ✔️ | | IQ4_NL | ❌ | ✔️ | | MXFP4 | ❌ | ✔️ | +# LoRA Apply Mode + +There are two ways to apply LoRA: **immediately** and **at_runtime**. You can specify it using the `--lora-apply-mode` parameter. + +By default, the mode is selected automatically: + +* If the model weights contain any quantized parameters, the **at_runtime** mode is used; +* Otherwise, the **immediately** mode is used. + +The **immediately** mode may have precision and compatibility issues with quantized parameters, but it usually offers faster inference speed and, in some cases, lower memory usage. +In contrast, the **at_runtime** mode provides better compatibility and higher precision, but inference may be slower and memory usage may be higher in some cases. + diff --git a/esrgan.hpp b/esrgan.hpp index adce62342..fb09544e0 100644 --- a/esrgan.hpp +++ b/esrgan.hpp @@ -344,7 +344,7 @@ struct ESRGAN : public GGMLRunner { if (!rrdb_net) return nullptr; constexpr int kGraphNodes = 1 << 16; // 65k - struct ggml_cgraph* gf = ggml_new_graph_custom(compute_ctx, kGraphNodes, /*grads*/ false); + struct ggml_cgraph* gf = new_graph_custom(kGraphNodes); x = to_backend(x); auto runner_ctx = get_context(); diff --git a/examples/cli/README.md b/examples/cli/README.md index 00e0942f1..c9cb46b20 100644 --- a/examples/cli/README.md +++ b/examples/cli/README.md @@ -94,11 +94,18 @@ Options: -M, --mode run mode, one of [img_gen, vid_gen, upscale, convert], default: img_gen --type weight type (examples: f32, f16, q4_0, q4_1, q5_0, q5_1, q8_0, q2_K, q3_K, q4_K). If not specified, the default is the type of the weight file - --rng RNG, one of [std_default, cuda], default: cuda + --rng RNG, one of [std_default, cuda, cpu], default: cuda(sd-webui), cpu(comfyui) + --sampler-rng sampler RNG, one of [std_default, cuda, cpu]. If not specified, use --rng -s, --seed RNG seed (default: 42, use random seed for < 0) --sampling-method sampling method, one of [euler, euler_a, heun, dpm2, dpm++2s_a, dpm++2m, dpm++2mv2, ipndm, ipndm_v, lcm, ddim_trailing, tcd] (default: euler for Flux/SD3/Wan, euler_a otherwise) --prediction prediction type override, one of [eps, v, edm_v, sd3_flow, flux_flow] + --lora-apply-mode the way to apply LoRA, one of [auto, immediately, at_runtime], default is auto. In auto mode, if the model weights + contain any quantized parameters, the at_runtime mode will be used; otherwise, + immediately will be used. The immediately mode may have precision and + compatibility issues with quantized parameters, but it usually offers faster inference + speed and, in some cases, lower memory usage. The at_runtime mode, on the other
--scheduler denoiser sigma scheduler, one of [discrete, karras, exponential, ays, gits, smoothstep, sgm_uniform, simple], default: discrete --skip-layers layers to skip for SLG steps (default: [7,8,9]) @@ -113,4 +120,4 @@ Options: --vae-relative-tile-size relative tile size for vae tiling, format [X]x[Y], in fraction of image size if < 1, in number of tiles per dim if >=1 (overrides --vae-tile-size) --preview preview method. must be one of the following [none, proj, tae, vae] (default is none) -``` \ No newline at end of file +``` diff --git a/examples/cli/main.cpp b/examples/cli/main.cpp index 619c42847..3cfe9281a 100644 --- a/examples/cli/main.cpp +++ b/examples/cli/main.cpp @@ -110,21 +110,22 @@ struct SDParams { int fps = 16; float vace_strength = 1.f; - float strength = 0.75f; - float control_strength = 0.9f; - rng_type_t rng_type = CUDA_RNG; - int64_t seed = 42; - bool verbose = false; - bool offload_params_to_cpu = false; - bool control_net_cpu = false; - bool clip_on_cpu = false; - bool vae_on_cpu = false; - bool diffusion_flash_attn = false; - bool diffusion_conv_direct = false; - bool vae_conv_direct = false; - bool canny_preprocess = false; - bool color = false; - int upscale_repeats = 1; + float strength = 0.75f; + float control_strength = 0.9f; + rng_type_t rng_type = CUDA_RNG; + rng_type_t sampler_rng_type = RNG_TYPE_COUNT; + int64_t seed = 42; + bool verbose = false; + bool offload_params_to_cpu = false; + bool control_net_cpu = false; + bool clip_on_cpu = false; + bool vae_on_cpu = false; + bool diffusion_flash_attn = false; + bool diffusion_conv_direct = false; + bool vae_conv_direct = false; + bool canny_preprocess = false; + bool color = false; + int upscale_repeats = 1; // Photo Maker std::string photo_maker_path; @@ -137,7 +138,8 @@ struct SDParams { int chroma_t5_mask_pad = 1; float flow_shift = INFINITY; - prediction_t prediction = DEFAULT_PRED; + prediction_t prediction = DEFAULT_PRED; + lora_apply_mode_t lora_apply_mode = LORA_APPLY_AUTO; sd_tiling_params_t vae_tiling_params = {false, 0, 0, 0.5f, 0.0f, 0.0f}; bool force_sdxl_vae_conv_scale = false; @@ -209,9 +211,11 @@ void print_params(SDParams params) { printf(" high_noise_sample_params: %s\n", SAFE_STR(high_noise_sample_params_str)); printf(" moe_boundary: %.3f\n", params.moe_boundary); printf(" prediction: %s\n", sd_prediction_name(params.prediction)); + printf(" lora_apply_mode: %s\n", sd_lora_apply_mode_name(params.lora_apply_mode)); printf(" flow_shift: %.2f\n", params.flow_shift); printf(" strength(img2img): %.2f\n", params.strength); printf(" rng: %s\n", sd_rng_type_name(params.rng_type)); + printf(" sampler rng: %s\n", sd_rng_type_name(params.sampler_rng_type)); printf(" seed: %zd\n", params.seed); printf(" batch_count: %d\n", params.batch_count); printf(" vae_tiling: %s\n", params.vae_tiling_params.enabled ? 
"true" : "false"); @@ -884,6 +888,20 @@ void parse_args(int argc, const char** argv, SDParams& params) { return 1; }; + auto on_sampler_rng_arg = [&](int argc, const char** argv, int index) { + if (++index >= argc) { + return -1; + } + const char* arg = argv[index]; + params.sampler_rng_type = str_to_rng_type(arg); + if (params.sampler_rng_type == RNG_TYPE_COUNT) { + fprintf(stderr, "error: invalid sampler rng type %s\n", + arg); + return -1; + } + return 1; + }; + auto on_schedule_arg = [&](int argc, const char** argv, int index) { if (++index >= argc) { return -1; @@ -926,6 +944,20 @@ void parse_args(int argc, const char** argv, SDParams& params) { return 1; }; + auto on_lora_apply_mode_arg = [&](int argc, const char** argv, int index) { + if (++index >= argc) { + return -1; + } + const char* arg = argv[index]; + params.lora_apply_mode = str_to_lora_apply_mode(arg); + if (params.lora_apply_mode == LORA_APPLY_MODE_COUNT) { + fprintf(stderr, "error: invalid lora apply model %s\n", + arg); + return -1; + } + return 1; + }; + auto on_sample_method_arg = [&](int argc, const char** argv, int index) { if (++index >= argc) { return -1; @@ -1108,8 +1140,12 @@ void parse_args(int argc, const char** argv, SDParams& params) { on_type_arg}, {"", "--rng", - "RNG, one of [std_default, cuda], default: cuda", + "RNG, one of [std_default, cuda, cpu], default: cuda(sd-webui), cpu(comfyui)", on_rng_arg}, + {"", + "--sampler-rng", + "sampler RNG, one of [std_default, cuda, cpu]. If not specified, use --rng", + on_sampler_rng_arg}, {"-s", "--seed", "RNG seed (default: 42, use random seed for < 0)", @@ -1123,6 +1159,14 @@ void parse_args(int argc, const char** argv, SDParams& params) { "--prediction", "prediction type override, one of [eps, v, edm_v, sd3_flow, flux_flow]", on_prediction_arg}, + {"", + "--lora-apply-mode", + "the way to apply LoRA, one of [auto, immediately, at_runtime], default is auto. " + "In auto mode, if the model weights contain any quantized parameters, the at_runtime mode will be used; otherwise, immediately will be used." + "The immediately mode may have precision and compatibility issues with quantized parameters, " + "but it usually offers faster inference speed and, in some cases, lower memory usage. 
" + "The at_runtime mode, on the other hand, is exactly the opposite.", + on_lora_apply_mode_arg}, {"", "--scheduler", "denoiser sigma scheduler, one of [discrete, karras, exponential, ays, gits, smoothstep, sgm_uniform, simple], default: discrete", @@ -1217,10 +1261,6 @@ void parse_args(int argc, const char** argv, SDParams& params) { exit(1); } - if (params.mode != CONVERT && params.tensor_type_rules.size() > 0) { - fprintf(stderr, "warning: --tensor-type-rules is currently supported only for conversion\n"); - } - if (params.mode == VID_GEN && params.video_frames <= 0) { fprintf(stderr, "warning: --video-frames must be at least 1\n"); exit(1); @@ -1299,6 +1339,9 @@ std::string get_image_params(SDParams params, int64_t seed) { parameter_string += "Size: " + std::to_string(params.width) + "x" + std::to_string(params.height) + ", "; parameter_string += "Model: " + sd_basename(params.model_path) + ", "; parameter_string += "RNG: " + std::string(sd_rng_type_name(params.rng_type)) + ", "; + if (params.sampler_rng_type != RNG_TYPE_COUNT) { + parameter_string += "Sampler RNG: " + std::string(sd_rng_type_name(params.sampler_rng_type)) + ", "; + } parameter_string += "Sampler: " + std::string(sd_sample_method_name(params.sample_params.sample_method)); if (params.sample_params.scheduler != DEFAULT) { parameter_string += " " + std::string(sd_schedule_name(params.sample_params.scheduler)); @@ -1732,12 +1775,15 @@ int main(int argc, const char* argv[]) { params.lora_model_dir.c_str(), params.embedding_dir.c_str(), params.photo_maker_path.c_str(), + params.tensor_type_rules.c_str(), vae_decode_only, true, params.n_threads, params.wtype, params.rng_type, + params.sampler_rng_type, params.prediction, + params.lora_apply_mode, params.offload_params_to_cpu, params.clip_on_cpu, params.control_net_cpu, diff --git a/flux.hpp b/flux.hpp index 8a255aa16..2f85cf8c1 100644 --- a/flux.hpp +++ b/flux.hpp @@ -1243,7 +1243,7 @@ namespace Flux { bool increase_ref_index = false, std::vector skip_layers = {}) { GGML_ASSERT(x->ne[3] == 1); - struct ggml_cgraph* gf = ggml_new_graph_custom(compute_ctx, FLUX_GRAPH_SIZE, false); + struct ggml_cgraph* gf = new_graph_custom(FLUX_GRAPH_SIZE); struct ggml_tensor* mod_index_arange = nullptr; struct ggml_tensor* dct = nullptr; // for chroma radiance diff --git a/ggml_extend.hpp b/ggml_extend.hpp index eaf501656..aa1664589 100644 --- a/ggml_extend.hpp +++ b/ggml_extend.hpp @@ -959,12 +959,15 @@ __STATIC_INLINE__ struct ggml_tensor* ggml_ext_linear(struct ggml_context* ctx, int64_t ne3 = x->ne[3]; x = ggml_reshape_2d(ctx, x, x->ne[0], x->ne[1] * x->ne[2] * x->ne[3]); x = ggml_mul_mat(ctx, w, x); - x = ggml_reshape_4d(ctx, x, x->ne[0], x->ne[1] / ne2 / ne3, ne2, ne3); + if (force_prec_f32) { + ggml_mul_mat_set_prec(x, GGML_PREC_F32); + } + x = ggml_reshape_4d(ctx, x, x->ne[0], x->ne[1] / ne2 / ne3, ne2, ne3); } else { x = ggml_mul_mat(ctx, w, x); - } - if (force_prec_f32) { - ggml_mul_mat_set_prec(x, GGML_PREC_F32); + if (force_prec_f32) { + ggml_mul_mat_set_prec(x, GGML_PREC_F32); + } } if (scale != 1.f) { x = ggml_scale(ctx, x, 1.f / scale); @@ -1119,6 +1122,18 @@ __STATIC_INLINE__ struct ggml_tensor* ggml_ext_ones(struct ggml_context* ctx, return ggml_ext_full(ctx, 1.f, ne0, ne1, ne2, ne3); } +__STATIC_INLINE__ ggml_tensor* ggml_ext_cast_f32(ggml_context* ctx, ggml_tensor* a) { + auto out = ggml_reshape_2d(ctx, a, 1, ggml_nelements(a)); + ggml_tensor* one = ggml_ext_ones(ctx, 1, 1, 1, 1); // [1,] + if (ggml_is_transposed(out)) { + out = ggml_mul_mat(ctx, one, out); + } else { + 
out = ggml_mul_mat(ctx, out, one); + } + out = ggml_reshape(ctx, out, a); + return out; +} + // q: [N * n_head, n_token, d_head] // k: [N * n_head, n_k, d_head] // v: [N * n_head, d_head, n_k] @@ -1460,11 +1475,43 @@ __STATIC_INLINE__ size_t ggml_tensor_num(ggml_context* ctx) { #define MAX_PARAMS_TENSOR_NUM 32768 #define MAX_GRAPH_SIZE 327680 +struct WeightAdapter { + struct ForwardParams { + enum class op_type_t { + OP_LINEAR, + OP_CONV2D, + } op_type; + struct { + bool force_prec_f32 = false; + float scale = 1.f; + } linear; + struct { + int s0 = 1; + int s1 = 1; + int p0 = 0; + int p1 = 0; + int d0 = 1; + int d1 = 1; + bool direct = false; + float scale = 1.f; + } conv2d; + }; + virtual ggml_tensor* patch_weight(ggml_context* ctx, ggml_tensor* weight, const std::string& weight_name) = 0; + virtual ggml_tensor* forward_with_lora(ggml_context* ctx, + ggml_tensor* x, + ggml_tensor* w, + ggml_tensor* b, + const std::string& prefix, + ForwardParams forward_params) = 0; + virtual size_t get_extra_graph_size() = 0; +}; + struct GGMLRunnerContext { - ggml_backend_t backend = nullptr; - ggml_context* ggml_ctx = nullptr; - bool flash_attn_enabled = false; - bool conv2d_direct_enabled = false; + ggml_backend_t backend = nullptr; + ggml_context* ggml_ctx = nullptr; + bool flash_attn_enabled = false; + bool conv2d_direct_enabled = false; + std::shared_ptr weight_adapter = nullptr; }; struct GGMLRunner { @@ -1486,6 +1533,8 @@ struct GGMLRunner { struct ggml_context* compute_ctx = nullptr; struct ggml_gallocr* compute_allocr = nullptr; + std::shared_ptr weight_adapter = nullptr; + std::vector one_vec = {1.f}; ggml_tensor* one_tensor = nullptr; @@ -1565,6 +1614,13 @@ struct GGMLRunner { ggml_build_forward_expand(gf, one_tensor); } + struct ggml_cgraph* new_graph_custom(size_t graph_size) { + if (weight_adapter) { + graph_size += weight_adapter->get_extra_graph_size(); + } + return ggml_new_graph_custom(compute_ctx, graph_size, false); + } + struct ggml_cgraph* get_compute_graph(get_graph_cb_t get_graph) { prepare_build_in_tensor_before(); struct ggml_cgraph* gf = get_graph(); @@ -1760,6 +1816,7 @@ struct GGMLRunner { runner_ctx.backend = runtime_backend; runner_ctx.flash_attn_enabled = flash_attn_enabled; runner_ctx.conv2d_direct_enabled = conv2d_direct_enabled; + runner_ctx.weight_adapter = weight_adapter; return runner_ctx; } @@ -1891,6 +1948,10 @@ struct GGMLRunner { void set_conv2d_direct_enabled(bool enabled) { conv2d_direct_enabled = enabled; } + + void set_weight_adapter(const std::shared_ptr& adapter) { + weight_adapter = adapter; + } }; class GGMLBlock { @@ -2006,8 +2067,10 @@ class Linear : public UnaryBlock { bool force_f32; bool force_prec_f32; float scale; + std::string prefix; void init_params(struct ggml_context* ctx, const String2TensorStorage& tensor_storage_map = {}, const std::string prefix = "") override { + this->prefix = prefix; enum ggml_type wtype = get_type(prefix + "weight", tensor_storage_map, GGML_TYPE_F32); if (in_features % ggml_blck_size(wtype) != 0 || force_f32) { wtype = GGML_TYPE_F32; @@ -2039,6 +2102,13 @@ class Linear : public UnaryBlock { if (bias) { b = params["bias"]; } + if (ctx->weight_adapter) { + WeightAdapter::ForwardParams forward_params; + forward_params.op_type = WeightAdapter::ForwardParams::op_type_t::OP_LINEAR; + forward_params.linear.force_prec_f32 = force_prec_f32; + forward_params.linear.scale = scale; + return ctx->weight_adapter->forward_with_lora(ctx->ggml_ctx, x, w, b, prefix, forward_params); + } return ggml_ext_linear(ctx->ggml_ctx, x, w, b, 
force_prec_f32, scale); } }; @@ -2098,8 +2168,10 @@ class Conv2d : public UnaryBlock { std::pair dilation; bool bias; float scale = 1.f; + std::string prefix; void init_params(struct ggml_context* ctx, const String2TensorStorage& tensor_storage_map, const std::string prefix = "") override { + this->prefix = prefix; enum ggml_type wtype = GGML_TYPE_F16; params["weight"] = ggml_new_tensor_4d(ctx, wtype, kernel_size.second, kernel_size.first, in_channels, out_channels); if (bias) { @@ -2138,6 +2210,19 @@ class Conv2d : public UnaryBlock { if (bias) { b = params["bias"]; } + if (ctx->weight_adapter) { + WeightAdapter::ForwardParams forward_params; + forward_params.op_type = WeightAdapter::ForwardParams::op_type_t::OP_CONV2D; + forward_params.conv2d.s0 = stride.second; + forward_params.conv2d.s1 = stride.first; + forward_params.conv2d.p0 = padding.second; + forward_params.conv2d.p1 = padding.first; + forward_params.conv2d.d0 = dilation.second; + forward_params.conv2d.d1 = dilation.first; + forward_params.conv2d.direct = ctx->conv2d_direct_enabled; + forward_params.conv2d.scale = scale; + return ctx->weight_adapter->forward_with_lora(ctx->ggml_ctx, x, w, b, prefix, forward_params); + } return ggml_ext_conv_2d(ctx->ggml_ctx, x, w, @@ -2209,8 +2294,10 @@ class Conv3d : public UnaryBlock { std::tuple padding; std::tuple dilation; bool bias; + std::string prefix; void init_params(struct ggml_context* ctx, const String2TensorStorage& tensor_storage_map, const std::string prefix = "") override { + this->prefix = prefix; enum ggml_type wtype = GGML_TYPE_F16; params["weight"] = ggml_new_tensor_4d(ctx, wtype, @@ -2242,8 +2329,17 @@ class Conv3d : public UnaryBlock { struct ggml_tensor* forward(GGMLRunnerContext* ctx, struct ggml_tensor* x) { struct ggml_tensor* w = params["weight"]; struct ggml_tensor* b = nullptr; + if (ctx->weight_adapter) { + w = ctx->weight_adapter->patch_weight(ctx->ggml_ctx, w, prefix + "weight"); + if (w->type != GGML_TYPE_F16) { + w = ggml_cast(ctx->ggml_ctx, w, GGML_TYPE_F16); + } + } if (bias) { b = params["bias"]; + if (ctx->weight_adapter) { + b = ctx->weight_adapter->patch_weight(ctx->ggml_ctx, b, prefix + "bias"); + } } return ggml_ext_conv_3d(ctx->ggml_ctx, x, w, b, in_channels, std::get<2>(stride), std::get<1>(stride), std::get<0>(stride), @@ -2258,8 +2354,10 @@ class LayerNorm : public UnaryBlock { float eps; bool elementwise_affine; bool bias; + std::string prefix; void init_params(struct ggml_context* ctx, const String2TensorStorage& tensor_storage_map = {}, const std::string prefix = "") override { + this->prefix = prefix; if (elementwise_affine) { enum ggml_type wtype = GGML_TYPE_F32; params["weight"] = ggml_new_tensor_1d(ctx, wtype, normalized_shape); @@ -2286,8 +2384,14 @@ class LayerNorm : public UnaryBlock { if (elementwise_affine) { w = params["weight"]; + if (ctx->weight_adapter) { + w = ctx->weight_adapter->patch_weight(ctx->ggml_ctx, w, prefix + "weight"); + } if (bias) { b = params["bias"]; + if (ctx->weight_adapter) { + b = ctx->weight_adapter->patch_weight(ctx->ggml_ctx, b, prefix + "bias"); + } } } return ggml_ext_layer_norm(ctx->ggml_ctx, x, w, b, eps); @@ -2300,8 +2404,10 @@ class GroupNorm : public GGMLBlock { int64_t num_channels; float eps; bool affine; + std::string prefix; void init_params(struct ggml_context* ctx, const String2TensorStorage& tensor_storage_map = {}, const std::string prefix = "") override { + this->prefix = prefix; if (affine) { enum ggml_type wtype = GGML_TYPE_F32; enum ggml_type bias_wtype = GGML_TYPE_F32; @@ -2326,6 +2432,10 @@ 
class GroupNorm : public GGMLBlock { if (affine) { w = params["weight"]; b = params["bias"]; + if (ctx->weight_adapter) { + w = ctx->weight_adapter->patch_weight(ctx->ggml_ctx, w, prefix + "weight"); + b = ctx->weight_adapter->patch_weight(ctx->ggml_ctx, b, prefix + "bias"); + } } return ggml_ext_group_norm(ctx->ggml_ctx, x, w, b, num_groups); } @@ -2341,8 +2451,10 @@ class RMSNorm : public UnaryBlock { protected: int64_t hidden_size; float eps; + std::string prefix; void init_params(struct ggml_context* ctx, const String2TensorStorage& tensor_storage_map = {}, std::string prefix = "") override { + this->prefix = prefix; enum ggml_type wtype = GGML_TYPE_F32; params["weight"] = ggml_new_tensor_1d(ctx, wtype, hidden_size); } @@ -2355,8 +2467,11 @@ class RMSNorm : public UnaryBlock { struct ggml_tensor* forward(GGMLRunnerContext* ctx, struct ggml_tensor* x) { struct ggml_tensor* w = params["weight"]; - x = ggml_rms_norm(ctx->ggml_ctx, x, eps); - x = ggml_mul_inplace(ctx->ggml_ctx, x, w); + if (ctx->weight_adapter) { + w = ctx->weight_adapter->patch_weight(ctx->ggml_ctx, w, prefix + "weight"); + } + x = ggml_rms_norm(ctx->ggml_ctx, x, eps); + x = ggml_mul_inplace(ctx->ggml_ctx, x, w); return x; } }; diff --git a/lora.hpp b/lora.hpp index 6da9d833b..daabd4a01 100644 --- a/lora.hpp +++ b/lora.hpp @@ -7,22 +7,25 @@ #define LORA_GRAPH_BASE_SIZE 10240 struct LoraModel : public GGMLRunner { + std::string lora_id; float multiplier = 1.0f; - std::map lora_tensors; + std::unordered_map lora_tensors; std::map original_tensor_to_final_tensor; + std::set applied_lora_tensors; std::string file_path; ModelLoader model_loader; - bool load_failed = false; - bool applied = false; - bool tensor_preprocessed = false; - std::vector zero_index_vec = {0}; - ggml_tensor* zero_index = nullptr; + bool load_failed = false; + bool applied = false; + bool tensor_preprocessed = false; - LoraModel(ggml_backend_t backend, + typedef std::function filter_t; + + LoraModel(const std::string& lora_id, + ggml_backend_t backend, const std::string& file_path = "", std::string prefix = "", SDVersion version = VERSION_COUNT) - : file_path(file_path), GGMLRunner(backend, false) { + : lora_id(lora_id), file_path(file_path), GGMLRunner(backend, false) { prefix = "lora." 
+ prefix; if (!model_loader.init_from_file_and_convert_name(file_path, prefix, version)) { load_failed = true; @@ -33,7 +36,7 @@ struct LoraModel : public GGMLRunner { return "lora"; } - bool load_from_file(bool filter_tensor, int n_threads) { + bool load_from_file(int n_threads, filter_t filter = nullptr) { LOG_INFO("loading LoRA from '%s'", file_path.c_str()); if (load_failed) { @@ -48,7 +51,7 @@ struct LoraModel : public GGMLRunner { if (dry_run) { const std::string& name = tensor_storage.name; - if (filter_tensor && !contains(name, "lora.model")) { + if (filter && !filter(name)) { return true; } @@ -68,6 +71,10 @@ struct LoraModel : public GGMLRunner { model_loader.load_tensors(on_new_tensor_cb, n_threads); + if (tensors_to_create.empty()) { + return true; + } + for (const auto& pair : tensors_to_create) { const auto& name = pair.first; const auto& ts = pair.second; @@ -87,14 +94,6 @@ struct LoraModel : public GGMLRunner { return true; } - ggml_tensor* to_f32(ggml_context* ctx, ggml_tensor* a) { - auto out = ggml_reshape_1d(ctx, a, ggml_nelements(a)); - out = ggml_get_rows(ctx, out, zero_index); - out = ggml_reshape(ctx, out, a); - // auto out = ggml_cast(ctx, a, GGML_TYPE_F32); - return out; - } - void preprocess_lora_tensors(const std::map& model_tensors) { if (tensor_preprocessed) { return; @@ -102,7 +101,7 @@ struct LoraModel : public GGMLRunner { tensor_preprocessed = true; // I really hate these hardcoded processes. if (model_tensors.find("cond_stage_model.1.transformer.text_model.encoder.layers.0.self_attn.in_proj.weight") != model_tensors.end()) { - std::map new_lora_tensors; + std::unordered_map new_lora_tensors; for (auto& [old_name, tensor] : lora_tensors) { std::string new_name = old_name; @@ -130,7 +129,7 @@ struct LoraModel : public GGMLRunner { } } - ggml_tensor* get_lora_diff(const std::string& model_tensor_name, std::set& applied_lora_tensors) { + ggml_tensor* get_lora_weight_diff(const std::string& model_tensor_name, ggml_context* ctx) { ggml_tensor* updown = nullptr; int index = 0; while (true) { @@ -153,17 +152,17 @@ struct LoraModel : public GGMLRunner { auto iter = lora_tensors.find(lora_up_name); if (iter != lora_tensors.end()) { - lora_up = to_f32(compute_ctx, iter->second); + lora_up = ggml_ext_cast_f32(ctx, iter->second); } iter = lora_tensors.find(lora_mid_name); if (iter != lora_tensors.end()) { - lora_mid = to_f32(compute_ctx, iter->second); + lora_mid = ggml_ext_cast_f32(ctx, iter->second); } iter = lora_tensors.find(lora_down_name); if (iter != lora_tensors.end()) { - lora_down = to_f32(compute_ctx, iter->second); + lora_down = ggml_ext_cast_f32(ctx, iter->second); } if (lora_up == nullptr || lora_down == nullptr) { @@ -195,32 +194,61 @@ struct LoraModel : public GGMLRunner { } scale_value *= multiplier; - auto curr_updown = ggml_ext_merge_lora(compute_ctx, lora_down, lora_up, lora_mid); - curr_updown = ggml_scale_inplace(compute_ctx, curr_updown, scale_value); + auto curr_updown = ggml_ext_merge_lora(ctx, lora_down, lora_up, lora_mid); + curr_updown = ggml_scale_inplace(ctx, curr_updown, scale_value); if (updown == nullptr) { updown = curr_updown; } else { - updown = ggml_concat(compute_ctx, updown, curr_updown, ggml_n_dims(updown) - 1); + updown = ggml_concat(ctx, updown, curr_updown, ggml_n_dims(updown) - 1); } index++; } + return updown; + } - // diff - if (updown == nullptr) { - std::string lora_diff_name = "lora." 
+ model_tensor_name + ".diff"; + ggml_tensor* get_raw_weight_diff(const std::string& model_tensor_name, ggml_context* ctx) { + ggml_tensor* updown = nullptr; + int index = 0; + while (true) { + std::string key; + if (index == 0) { + key = model_tensor_name; + } else { + key = model_tensor_name + "." + std::to_string(index); + } + + std::string diff_name = "lora." + key + ".diff"; + + ggml_tensor* curr_updown = nullptr; + + auto iter = lora_tensors.find(diff_name); + if (iter != lora_tensors.end()) { + curr_updown = ggml_ext_cast_f32(ctx, iter->second); + } else { + break; + } + + applied_lora_tensors.insert(diff_name); + + float scale_value = 1.0f; + scale_value *= multiplier; + + curr_updown = ggml_scale_inplace(ctx, curr_updown, scale_value); - if (lora_tensors.find(lora_diff_name) != lora_tensors.end()) { - updown = to_f32(compute_ctx, lora_tensors[lora_diff_name]); - applied_lora_tensors.insert(lora_diff_name); + if (updown == nullptr) { + updown = curr_updown; + } else { + updown = ggml_concat(ctx, updown, curr_updown, ggml_n_dims(updown) - 1); } - } + index++; + } return updown; } - ggml_tensor* get_loha_diff(const std::string& model_tensor_name, std::set& applied_lora_tensors) { + ggml_tensor* get_loha_weight_diff(const std::string& model_tensor_name, ggml_context* ctx) { ggml_tensor* updown = nullptr; int index = 0; while (true) { @@ -248,34 +276,34 @@ struct LoraModel : public GGMLRunner { auto iter = lora_tensors.find(hada_1_down_name); if (iter != lora_tensors.end()) { - hada_1_down = to_f32(compute_ctx, iter->second); + hada_1_down = ggml_ext_cast_f32(ctx, iter->second); } iter = lora_tensors.find(hada_1_up_name); if (iter != lora_tensors.end()) { - hada_1_up = to_f32(compute_ctx, iter->second); + hada_1_up = ggml_ext_cast_f32(ctx, iter->second); } iter = lora_tensors.find(hada_1_mid_name); if (iter != lora_tensors.end()) { - hada_1_mid = to_f32(compute_ctx, iter->second); - hada_1_up = ggml_cont(compute_ctx, ggml_transpose(compute_ctx, hada_1_up)); + hada_1_mid = ggml_ext_cast_f32(ctx, iter->second); + hada_1_up = ggml_cont(ctx, ggml_transpose(ctx, hada_1_up)); } iter = lora_tensors.find(hada_2_down_name); if (iter != lora_tensors.end()) { - hada_2_down = to_f32(compute_ctx, iter->second); + hada_2_down = ggml_ext_cast_f32(ctx, iter->second); } iter = lora_tensors.find(hada_2_up_name); if (iter != lora_tensors.end()) { - hada_2_up = to_f32(compute_ctx, iter->second); + hada_2_up = ggml_ext_cast_f32(ctx, iter->second); } iter = lora_tensors.find(hada_2_mid_name); if (iter != lora_tensors.end()) { - hada_2_mid = to_f32(compute_ctx, iter->second); - hada_2_up = ggml_cont(compute_ctx, ggml_transpose(compute_ctx, hada_2_up)); + hada_2_mid = ggml_ext_cast_f32(ctx, iter->second); + hada_2_up = ggml_cont(ctx, ggml_transpose(ctx, hada_2_up)); } if (hada_1_up == nullptr || hada_1_down == nullptr || hada_2_up == nullptr || hada_2_down == nullptr) { @@ -309,21 +337,21 @@ struct LoraModel : public GGMLRunner { } scale_value *= multiplier; - struct ggml_tensor* updown_1 = ggml_ext_merge_lora(compute_ctx, hada_1_down, hada_1_up, hada_1_mid); - struct ggml_tensor* updown_2 = ggml_ext_merge_lora(compute_ctx, hada_2_down, hada_2_up, hada_2_mid); - auto curr_updown = ggml_mul_inplace(compute_ctx, updown_1, updown_2); - curr_updown = ggml_scale_inplace(compute_ctx, curr_updown, scale_value); + struct ggml_tensor* updown_1 = ggml_ext_merge_lora(ctx, hada_1_down, hada_1_up, hada_1_mid); + struct ggml_tensor* updown_2 = ggml_ext_merge_lora(ctx, hada_2_down, hada_2_up, hada_2_mid); + auto curr_updown 
= ggml_mul_inplace(ctx, updown_1, updown_2); + curr_updown = ggml_scale_inplace(ctx, curr_updown, scale_value); if (updown == nullptr) { updown = curr_updown; } else { - updown = ggml_concat(compute_ctx, updown, curr_updown, ggml_n_dims(updown) - 1); + updown = ggml_concat(ctx, updown, curr_updown, ggml_n_dims(updown) - 1); } index++; } return updown; } - ggml_tensor* get_lokr_diff(const std::string& model_tensor_name, std::set& applied_lora_tensors) { + ggml_tensor* get_lokr_weight_diff(const std::string& model_tensor_name, ggml_context* ctx) { ggml_tensor* updown = nullptr; int index = 0; while (true) { @@ -350,24 +378,24 @@ struct LoraModel : public GGMLRunner { auto iter = lora_tensors.find(lokr_w1_name); if (iter != lora_tensors.end()) { - lokr_w1 = to_f32(compute_ctx, iter->second); + lokr_w1 = ggml_ext_cast_f32(ctx, iter->second); } iter = lora_tensors.find(lokr_w2_name); if (iter != lora_tensors.end()) { - lokr_w2 = to_f32(compute_ctx, iter->second); + lokr_w2 = ggml_ext_cast_f32(ctx, iter->second); } int64_t rank = 1; if (lokr_w1 == nullptr) { iter = lora_tensors.find(lokr_w1_a_name); if (iter != lora_tensors.end()) { - lokr_w1_a = to_f32(compute_ctx, iter->second); + lokr_w1_a = ggml_ext_cast_f32(ctx, iter->second); } iter = lora_tensors.find(lokr_w1_b_name); if (iter != lora_tensors.end()) { - lokr_w1_b = to_f32(compute_ctx, iter->second); + lokr_w1_b = ggml_ext_cast_f32(ctx, iter->second); } if (lokr_w1_a == nullptr || lokr_w1_b == nullptr) { @@ -376,18 +404,18 @@ struct LoraModel : public GGMLRunner { rank = lokr_w1_b->ne[ggml_n_dims(lokr_w1_b) - 1]; - lokr_w1 = ggml_ext_merge_lora(compute_ctx, lokr_w1_b, lokr_w1_a); + lokr_w1 = ggml_ext_merge_lora(ctx, lokr_w1_b, lokr_w1_a); } if (lokr_w2 == nullptr) { iter = lora_tensors.find(lokr_w2_a_name); if (iter != lora_tensors.end()) { - lokr_w2_a = to_f32(compute_ctx, iter->second); + lokr_w2_a = ggml_ext_cast_f32(ctx, iter->second); } iter = lora_tensors.find(lokr_w2_b_name); if (iter != lora_tensors.end()) { - lokr_w2_b = to_f32(compute_ctx, iter->second); + lokr_w2_b = ggml_ext_cast_f32(ctx, iter->second); } if (lokr_w2_a == nullptr || lokr_w2_b == nullptr) { @@ -396,7 +424,7 @@ struct LoraModel : public GGMLRunner { rank = lokr_w2_b->ne[ggml_n_dims(lokr_w2_b) - 1]; - lokr_w2 = ggml_ext_merge_lora(compute_ctx, lokr_w2_b, lokr_w2_a); + lokr_w2 = ggml_ext_merge_lora(ctx, lokr_w2_b, lokr_w2_a); } if (!lokr_w1_a) { @@ -427,49 +455,208 @@ struct LoraModel : public GGMLRunner { scale_value *= multiplier; - auto curr_updown = ggml_ext_kronecker(compute_ctx, lokr_w1, lokr_w2); - curr_updown = ggml_scale_inplace(compute_ctx, curr_updown, scale_value); + auto curr_updown = ggml_ext_kronecker(ctx, lokr_w1, lokr_w2); + curr_updown = ggml_scale_inplace(ctx, curr_updown, scale_value); if (updown == nullptr) { updown = curr_updown; } else { - updown = ggml_concat(compute_ctx, updown, curr_updown, ggml_n_dims(updown) - 1); + updown = ggml_concat(ctx, updown, curr_updown, ggml_n_dims(updown) - 1); } index++; } return updown; } + ggml_tensor* get_weight_diff(const std::string& model_tensor_name, ggml_context* ctx, ggml_tensor* model_tensor, bool with_lora = true) { + // lora + ggml_tensor* diff = nullptr; + if (with_lora) { + diff = get_lora_weight_diff(model_tensor_name, ctx); + } + // diff + if (diff == nullptr) { + diff = get_raw_weight_diff(model_tensor_name, ctx); + } + // loha + if (diff == nullptr) { + diff = get_loha_weight_diff(model_tensor_name, ctx); + } + // lokr + if (diff == nullptr) { + diff = get_lokr_weight_diff(model_tensor_name, 
ctx); + } + if (diff != nullptr) { + if (ggml_nelements(diff) < ggml_nelements(model_tensor)) { + if (ggml_n_dims(diff) == 2 && ggml_n_dims(model_tensor) == 2 && diff->ne[0] == model_tensor->ne[0]) { + LOG_WARN("pad for %s", model_tensor_name.c_str()); + auto pad_tensor = ggml_ext_zeros(ctx, diff->ne[0], model_tensor->ne[1] - diff->ne[1], 1, 1); + diff = ggml_concat(ctx, diff, pad_tensor, 1); + } + } + + GGML_ASSERT(ggml_nelements(diff) == ggml_nelements(model_tensor)); + diff = ggml_reshape(ctx, diff, model_tensor); + } + return diff; + } + + ggml_tensor* get_out_diff(ggml_context* ctx, + ggml_tensor* x, + WeightAdapter::ForwardParams forward_params, + const std::string& model_tensor_name) { + ggml_tensor* out_diff = nullptr; + int index = 0; + while (true) { + std::string key; + if (index == 0) { + key = model_tensor_name; + } else { + key = model_tensor_name + "." + std::to_string(index); + } + + std::string lora_down_name = "lora." + key + ".lora_down"; + std::string lora_up_name = "lora." + key + ".lora_up"; + std::string lora_mid_name = "lora." + key + ".lora_mid"; + std::string scale_name = "lora." + key + ".scale"; + std::string alpha_name = "lora." + key + ".alpha"; + + ggml_tensor* lora_up = nullptr; + ggml_tensor* lora_mid = nullptr; + ggml_tensor* lora_down = nullptr; + + bool is_conv2d = forward_params.op_type == WeightAdapter::ForwardParams::op_type_t::OP_CONV2D; + + auto iter = lora_tensors.find(lora_up_name); + if (iter != lora_tensors.end()) { + lora_up = iter->second; + if (is_conv2d && lora_up->type != GGML_TYPE_F16) { + lora_up = ggml_cast(ctx, lora_up, GGML_TYPE_F16); + } + } + + iter = lora_tensors.find(lora_mid_name); + if (iter != lora_tensors.end()) { + lora_mid = iter->second; + if (is_conv2d && lora_mid->type != GGML_TYPE_F16) { + lora_mid = ggml_cast(ctx, lora_mid, GGML_TYPE_F16); + } + } + + iter = lora_tensors.find(lora_down_name); + if (iter != lora_tensors.end()) { + lora_down = iter->second; + if (is_conv2d && lora_down->type != GGML_TYPE_F16) { + lora_down = ggml_cast(ctx, lora_down, GGML_TYPE_F16); + } + } + + if (lora_up == nullptr || lora_down == nullptr) { + break; + } + + applied_lora_tensors.insert(lora_up_name); + applied_lora_tensors.insert(lora_down_name); + + if (lora_mid) { + applied_lora_tensors.insert(lora_mid_name); + } + + float scale_value = 1.0f; + + int64_t rank = lora_down->ne[ggml_n_dims(lora_down) - 1]; + iter = lora_tensors.find(scale_name); + if (iter != lora_tensors.end()) { + scale_value = ggml_ext_backend_tensor_get_f32(iter->second); + applied_lora_tensors.insert(scale_name); + } else { + iter = lora_tensors.find(alpha_name); + if (iter != lora_tensors.end()) { + float alpha = ggml_ext_backend_tensor_get_f32(iter->second); + scale_value = alpha / rank; + // LOG_DEBUG("rank %s %ld %.2f %.2f", alpha_name.c_str(), rank, alpha, scale_value); + applied_lora_tensors.insert(alpha_name); + } + } + scale_value *= multiplier; + + ggml_tensor* lx; + if (!is_conv2d) { + lx = ggml_ext_linear(ctx, x, lora_down, nullptr, forward_params.linear.force_prec_f32, forward_params.linear.scale); + if (lora_mid) { + lx = ggml_ext_linear(ctx, lx, lora_mid, nullptr, forward_params.linear.force_prec_f32, forward_params.linear.scale); + } + lx = ggml_ext_linear(ctx, lx, lora_up, nullptr, forward_params.linear.force_prec_f32, forward_params.linear.scale); + } else { // OP_CONV2D + lx = ggml_ext_conv_2d(ctx, + x, + lora_down, + nullptr, + forward_params.conv2d.s0, + forward_params.conv2d.s1, + forward_params.conv2d.p0, + forward_params.conv2d.p1, + 
forward_params.conv2d.d0, + forward_params.conv2d.d1, + forward_params.conv2d.direct, + forward_params.conv2d.scale); + if (lora_mid) { + lx = ggml_ext_conv_2d(ctx, + lx, + lora_mid, + nullptr, + 1, + 1, + 0, + 0, + 1, + 1, + forward_params.conv2d.direct, + forward_params.conv2d.scale); + } + lx = ggml_ext_conv_2d(ctx, + lx, + lora_up, + nullptr, + 1, + 1, + 0, + 0, + 1, + 1, + forward_params.conv2d.direct, + forward_params.conv2d.scale); + } + + auto curr_out_diff = ggml_scale_inplace(ctx, lx, scale_value); + + if (out_diff == nullptr) { + out_diff = curr_out_diff; + } else { + out_diff = ggml_concat(ctx, out_diff, curr_out_diff, ggml_n_dims(out_diff) - 1); + } + + index++; + } + return out_diff; + } + struct ggml_cgraph* build_lora_graph(const std::map& model_tensors, SDVersion version) { size_t lora_graph_size = LORA_GRAPH_BASE_SIZE + lora_tensors.size() * 10; struct ggml_cgraph* gf = ggml_new_graph_custom(compute_ctx, lora_graph_size, false); - zero_index = ggml_new_tensor_1d(compute_ctx, GGML_TYPE_I32, 1); - set_backend_tensor_data(zero_index, zero_index_vec.data()); - ggml_build_forward_expand(gf, zero_index); - preprocess_lora_tensors(model_tensors); original_tensor_to_final_tensor.clear(); + applied_lora_tensors.clear(); - std::set applied_lora_tensors; for (auto it : model_tensors) { std::string model_tensor_name = it.first; ggml_tensor* model_tensor = it.second; // lora - ggml_tensor* updown = get_lora_diff(model_tensor_name, applied_lora_tensors); - // loha - if (updown == nullptr) { - updown = get_loha_diff(model_tensor_name, applied_lora_tensors); - } - - // lokr - if (updown == nullptr) { - updown = get_lokr_diff(model_tensor_name, applied_lora_tensors); - } - - if (updown == nullptr) { + ggml_tensor* diff = get_weight_diff(model_tensor_name, compute_ctx, model_tensor); + if (diff == nullptr) { continue; } @@ -479,38 +666,49 @@ struct LoraModel : public GGMLRunner { set_backend_tensor_data(model_tensor, original_tensor->data); } - if (ggml_nelements(updown) < ggml_nelements(model_tensor)) { - if (ggml_n_dims(updown) == 2 && ggml_n_dims(model_tensor) == 2 && updown->ne[0] == model_tensor->ne[0]) { - LOG_WARN("pad for %s", model_tensor_name.c_str()); - auto pad_tensor = ggml_ext_zeros(compute_ctx, updown->ne[0], model_tensor->ne[1] - updown->ne[1], 1, 1); - updown = ggml_concat(compute_ctx, updown, pad_tensor, 1); - } - } - - GGML_ASSERT(ggml_nelements(updown) == ggml_nelements(model_tensor)); - updown = ggml_reshape(compute_ctx, updown, model_tensor); ggml_tensor* final_tensor; if (model_tensor->type != GGML_TYPE_F32 && model_tensor->type != GGML_TYPE_F16) { - final_tensor = to_f32(compute_ctx, model_tensor); - final_tensor = ggml_add_inplace(compute_ctx, final_tensor, updown); + final_tensor = ggml_ext_cast_f32(compute_ctx, model_tensor); + final_tensor = ggml_add_inplace(compute_ctx, final_tensor, diff); final_tensor = ggml_cpy(compute_ctx, final_tensor, model_tensor); } else { - final_tensor = ggml_add_inplace(compute_ctx, model_tensor, updown); + final_tensor = ggml_add_inplace(compute_ctx, model_tensor, diff); } ggml_build_forward_expand(gf, final_tensor); if (!ggml_backend_is_cpu(runtime_backend) && ggml_backend_buffer_is_host(original_tensor->buffer)) { original_tensor_to_final_tensor[original_tensor] = final_tensor; } } + return gf; + } + + void apply(std::map model_tensors, SDVersion version, int n_threads) { + auto get_graph = [&]() -> struct ggml_cgraph* { + return build_lora_graph(model_tensors, version); + }; + GGMLRunner::compute(get_graph, n_threads, false); + 
stat(); + for (auto item : original_tensor_to_final_tensor) { + ggml_tensor* original_tensor = item.first; + ggml_tensor* final_tensor = item.second; + + ggml_backend_tensor_copy(final_tensor, original_tensor); + } + original_tensor_to_final_tensor.clear(); + GGMLRunner::free_compute_buffer(); + } + + void stat(bool at_runtime = false) { size_t total_lora_tensors_count = 0; size_t applied_lora_tensors_count = 0; for (auto& kv : lora_tensors) { total_lora_tensors_count++; if (applied_lora_tensors.find(kv.first) == applied_lora_tensors.end()) { - LOG_WARN("unused lora tensor |%s|", kv.first.c_str()); - print_ggml_tensor(kv.second, true); - // exit(0); + if (!at_runtime) { + LOG_WARN("unused lora tensor |%s|", kv.first.c_str()); + print_ggml_tensor(kv.second, true); + } } else { applied_lora_tensors_count++; } @@ -518,30 +716,87 @@ /* Don't worry if this message shows up twice in the logs per LoRA, * this function is called once to calculate the required buffer size * and then again to actually generate a graph to be used */ - if (applied_lora_tensors_count != total_lora_tensors_count) { - LOG_WARN("Only (%lu / %lu) LoRA tensors will be applied", - applied_lora_tensors_count, total_lora_tensors_count); + if (!at_runtime && applied_lora_tensors_count != total_lora_tensors_count) { + LOG_WARN("Only (%lu / %lu) LoRA tensors have been applied, lora_file_path = %s", - applied_lora_tensors_count, total_lora_tensors_count); + applied_lora_tensors_count, total_lora_tensors_count, file_path.c_str()); } else { - LOG_DEBUG("(%lu / %lu) LoRA tensors will be applied", - applied_lora_tensors_count, total_lora_tensors_count); + LOG_INFO("(%lu / %lu) LoRA tensors have been applied, lora_file_path = %s", + applied_lora_tensors_count, total_lora_tensors_count, file_path.c_str()); } + } +}; - return gf; +struct MultiLoraAdapter : public WeightAdapter { +protected: + std::vector<std::shared_ptr<LoraModel>> lora_models; + +public: + explicit MultiLoraAdapter(const std::vector<std::shared_ptr<LoraModel>>& lora_models) + : lora_models(lora_models) { } - void apply(std::map<std::string, struct ggml_tensor*> model_tensors, SDVersion version, int n_threads) { - auto get_graph = [&]() -> struct ggml_cgraph* { - return build_lora_graph(model_tensors, version); - }; - GGMLRunner::compute(get_graph, n_threads, false); - for (auto item : original_tensor_to_final_tensor) { - ggml_tensor* original_tensor = item.first; - ggml_tensor* final_tensor = item.second; + ggml_tensor* patch_weight(ggml_context* ctx, ggml_tensor* weight, const std::string& weight_name, bool with_lora) { + for (auto& lora_model : lora_models) { + ggml_tensor* diff = lora_model->get_weight_diff(weight_name, ctx, weight, with_lora); + if (diff == nullptr) { + continue; + } - ggml_backend_tensor_copy(final_tensor, original_tensor); + if (weight->type != GGML_TYPE_F32 && weight->type != GGML_TYPE_F16) { + weight = ggml_ext_cast_f32(ctx, weight); + } + weight = ggml_add(ctx, weight, diff); } - original_tensor_to_final_tensor.clear(); - GGMLRunner::free_compute_buffer(); + return weight; + } + + ggml_tensor* patch_weight(ggml_context* ctx, ggml_tensor* weight, const std::string& weight_name) override { + return patch_weight(ctx, weight, weight_name, true); + } + + ggml_tensor* forward_with_lora(ggml_context* ctx, + ggml_tensor* x, + ggml_tensor* w, + ggml_tensor* b, + const std::string& prefix, + WeightAdapter::ForwardParams forward_params) override { + w = patch_weight(ctx, w, prefix + "weight", false); + if (b) { + b = patch_weight(ctx, b, prefix + "bias", false); + } + ggml_tensor* out; + if (forward_params.op_type == ForwardParams::op_type_t::OP_LINEAR) {
+ out = ggml_ext_linear(ctx, x, w, b, forward_params.linear.force_prec_f32, forward_params.linear.scale); + } else { // OP_CONV2D + out = ggml_ext_conv_2d(ctx, + x, + w, + b, + forward_params.conv2d.s0, + forward_params.conv2d.s1, + forward_params.conv2d.p0, + forward_params.conv2d.p1, + forward_params.conv2d.d0, + forward_params.conv2d.d1, + forward_params.conv2d.direct, + forward_params.conv2d.scale); + } + for (auto& lora_model : lora_models) { + ggml_tensor* out_diff = lora_model->get_out_diff(ctx, x, forward_params, prefix + "weight"); + if (out_diff == nullptr) { + continue; + } + out = ggml_add_inplace(ctx, out, out_diff); + } + return out; + } + + size_t get_extra_graph_size() override { + size_t lora_tensor_num = 0; + for (auto& lora_model : lora_models) { + lora_tensor_num += lora_model->lora_tensors.size(); + } + return LORA_GRAPH_BASE_SIZE + lora_tensor_num * 10; } }; diff --git a/mmdit.hpp b/mmdit.hpp index 3ca01d952..c243e034a 100644 --- a/mmdit.hpp +++ b/mmdit.hpp @@ -870,7 +870,7 @@ struct MMDiTRunner : public GGMLRunner { struct ggml_tensor* context, struct ggml_tensor* y, std::vector skip_layers = std::vector()) { - struct ggml_cgraph* gf = ggml_new_graph_custom(compute_ctx, MMDIT_GRAPH_SIZE, false); + struct ggml_cgraph* gf = new_graph_custom(MMDIT_GRAPH_SIZE); x = to_backend(x); context = to_backend(context); diff --git a/model.cpp b/model.cpp index 519284e6a..dac6e88f5 100644 --- a/model.cpp +++ b/model.cpp @@ -1254,15 +1254,59 @@ std::map ModelLoader::get_vae_wtype_stat() { return wtype_stat; } -void ModelLoader::set_wtype_override(ggml_type wtype, std::string prefix) { +static std::vector> parse_tensor_type_rules(const std::string& tensor_type_rules) { + std::vector> result; + for (const auto& item : split_string(tensor_type_rules, ',')) { + if (item.size() == 0) + continue; + std::string::size_type pos = item.find('='); + if (pos == std::string::npos) { + LOG_WARN("ignoring invalid quant override \"%s\"", item.c_str()); + continue; + } + std::string tensor_pattern = item.substr(0, pos); + std::string type_name = item.substr(pos + 1); + + ggml_type tensor_type = GGML_TYPE_COUNT; + + if (type_name == "f32") { + tensor_type = GGML_TYPE_F32; + } else { + for (size_t i = 0; i < GGML_TYPE_COUNT; i++) { + auto trait = ggml_get_type_traits((ggml_type)i); + if (trait->to_float && trait->type_size && type_name == trait->type_name) { + tensor_type = (ggml_type)i; + } + } + } + + if (tensor_type != GGML_TYPE_COUNT) { + result.emplace_back(tensor_pattern, tensor_type); + } else { + LOG_WARN("ignoring invalid quant override \"%s\"", item.c_str()); + } + } + return result; +} + +void ModelLoader::set_wtype_override(ggml_type wtype, std::string tensor_type_rules) { + auto map_rules = parse_tensor_type_rules(tensor_type_rules); for (auto& [name, tensor_storage] : tensor_storage_map) { - if (!starts_with(name, prefix)) { + ggml_type dst_type = wtype; + for (const auto& tensor_type_rule : map_rules) { + std::regex pattern(tensor_type_rule.first); + if (std::regex_search(name, pattern)) { + dst_type = tensor_type_rule.second; + break; + } + } + if (dst_type == GGML_TYPE_COUNT) { continue; } - if (!tensor_should_be_converted(tensor_storage, wtype)) { + if (!tensor_should_be_converted(tensor_storage, dst_type)) { continue; } - tensor_storage.expected_type = wtype; + tensor_storage.expected_type = dst_type; } } @@ -1603,41 +1647,6 @@ bool ModelLoader::load_tensors(std::map& tenso return true; } -std::vector> parse_tensor_type_rules(const std::string& tensor_type_rules) { - std::vector> 
result; - for (const auto& item : split_string(tensor_type_rules, ',')) { - if (item.size() == 0) - continue; - std::string::size_type pos = item.find('='); - if (pos == std::string::npos) { - LOG_WARN("ignoring invalid quant override \"%s\"", item.c_str()); - continue; - } - std::string tensor_pattern = item.substr(0, pos); - std::string type_name = item.substr(pos + 1); - - ggml_type tensor_type = GGML_TYPE_COUNT; - - if (type_name == "f32") { - tensor_type = GGML_TYPE_F32; - } else { - for (size_t i = 0; i < GGML_TYPE_COUNT; i++) { - auto trait = ggml_get_type_traits((ggml_type)i); - if (trait->to_float && trait->type_size && type_name == trait->type_name) { - tensor_type = (ggml_type)i; - } - } - } - - if (tensor_type != GGML_TYPE_COUNT) { - result.emplace_back(tensor_pattern, tensor_type); - } else { - LOG_WARN("ignoring invalid quant override \"%s\"", item.c_str()); - } - } - return result; -} - bool ModelLoader::tensor_should_be_converted(const TensorStorage& tensor_storage, ggml_type type) { const std::string& name = tensor_storage.name; if (type != GGML_TYPE_COUNT) { diff --git a/model.h b/model.h index 588f98219..2ac079fb5 100644 --- a/model.h +++ b/model.h @@ -292,7 +292,7 @@ class ModelLoader { std::map get_diffusion_model_wtype_stat(); std::map get_vae_wtype_stat(); String2TensorStorage& get_tensor_storage_map() { return tensor_storage_map; } - void set_wtype_override(ggml_type wtype, std::string prefix = ""); + void set_wtype_override(ggml_type wtype, std::string tensor_type_rules = ""); bool load_tensors(on_new_tensor_cb_t on_new_tensor_cb, int n_threads = 0); bool load_tensors(std::map& tensors, std::set ignore_tensors = {}, diff --git a/name_conversion.cpp b/name_conversion.cpp index ea2702a71..c50baa518 100644 --- a/name_conversion.cpp +++ b/name_conversion.cpp @@ -855,6 +855,49 @@ std::string convert_sep_to_dot(std::string name) { return name; } +std::vector cond_stage_model_prefix_vec = { + "cond_stage_model.1.", + "cond_stage_model.", + "conditioner.embedders.", + "text_encoders.", +}; + +std::vector diffuison_model_prefix_vec = { + "model.diffusion_model.", +}; + +std::vector first_stage_model_prefix_vec = { + "first_stage_model.", + "vae.", +}; + +bool is_cond_stage_model_name(const std::string& name) { + for (const auto& prefix : cond_stage_model_prefix_vec) { + if (starts_with(name, prefix) || starts_with(name, "lora." + prefix)) { + return true; + } + } + return false; +} + +bool is_diffusion_model_name(const std::string& name) { + for (const auto& prefix : diffuison_model_prefix_vec) { + if (starts_with(name, prefix) || starts_with(name, "lora." + prefix)) { + return true; + } + } + return false; +} + +bool is_first_stage_model_name(const std::string& name) { + for (const auto& prefix : first_stage_model_prefix_vec) { + if (starts_with(name, prefix) || starts_with(name, "lora." 
+ prefix)) { + return true; + } + } + return false; +} + std::string convert_tensor_name(std::string name, SDVersion version) { bool is_lora = false; bool is_lycoris_underline = false; @@ -956,9 +999,6 @@ std::string convert_tensor_name(std::string name, SDVersion version) { // diffusion model { - std::vector diffuison_model_prefix_vec = { - "model.diffusion_model.", - }; for (const auto& prefix : diffuison_model_prefix_vec) { if (starts_with(name, prefix)) { name = convert_diffusion_model_name(name.substr(prefix.size()), prefix, version); @@ -970,12 +1010,6 @@ std::string convert_tensor_name(std::string name, SDVersion version) { // cond_stage_model { - std::vector cond_stage_model_prefix_vec = { - "cond_stage_model.1.", - "cond_stage_model.", - "conditioner.embedders.", - "text_encoders.", - }; for (const auto& prefix : cond_stage_model_prefix_vec) { if (starts_with(name, prefix)) { name = convert_cond_stage_model_name(name.substr(prefix.size()), prefix); @@ -987,10 +1021,6 @@ std::string convert_tensor_name(std::string name, SDVersion version) { // first_stage_model { - std::vector first_stage_model_prefix_vec = { - "first_stage_model.", - "vae.", - }; for (const auto& prefix : first_stage_model_prefix_vec) { if (starts_with(name, prefix)) { name = convert_first_stage_model_name(name.substr(prefix.size()), prefix); diff --git a/name_conversion.h b/name_conversion.h index eb3d1a9b9..3fefcf780 100644 --- a/name_conversion.h +++ b/name_conversion.h @@ -5,6 +5,10 @@ #include "model.h" +bool is_cond_stage_model_name(const std::string& name); +bool is_diffusion_model_name(const std::string& name); +bool is_first_stage_model_name(const std::string& name); + std::string convert_tensor_name(std::string name, SDVersion version); #endif // __NAME_CONVERSTION_H__ \ No newline at end of file diff --git a/qwen_image.hpp b/qwen_image.hpp index 87d2fb9b2..94ada47d7 100644 --- a/qwen_image.hpp +++ b/qwen_image.hpp @@ -543,7 +543,7 @@ namespace Qwen { std::vector ref_latents = {}, bool increase_ref_index = false) { GGML_ASSERT(x->ne[3] == 1); - struct ggml_cgraph* gf = ggml_new_graph_custom(compute_ctx, QWEN_IMAGE_GRAPH_SIZE, false); + struct ggml_cgraph* gf = new_graph_custom(QWEN_IMAGE_GRAPH_SIZE); x = to_backend(x); context = to_backend(context); diff --git a/qwenvl.hpp b/qwenvl.hpp index 0a914f6c9..9bc268466 100644 --- a/qwenvl.hpp +++ b/qwenvl.hpp @@ -1049,7 +1049,7 @@ namespace Qwen { } struct ggml_cgraph* build_encode_image_graph(struct ggml_tensor* image) { - struct ggml_cgraph* gf = ggml_new_graph_custom(compute_ctx, QWENVL_GRAPH_SIZE, false); + struct ggml_cgraph* gf = new_graph_custom(QWENVL_GRAPH_SIZE); GGML_ASSERT(image->ne[1] % (params.vision.patch_size * params.vision.spatial_merge_size) == 0); GGML_ASSERT(image->ne[0] % (params.vision.patch_size * params.vision.spatial_merge_size) == 0); diff --git a/rng_mt19937.hpp b/rng_mt19937.hpp new file mode 100644 index 000000000..7e6199886 --- /dev/null +++ b/rng_mt19937.hpp @@ -0,0 +1,147 @@ +#ifndef __RNG_MT19937_HPP__ +#define __RNG_MT19937_HPP__ + +#include +#include + +#include "rng.hpp" + +// RNG imitiating torch cpu randn on CPU. 
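+// The generator below pairs PyTorch's MT19937 engine with a Box-Muller transform
+// (normal_fill_16 / normal_double_value), so for a given seed the sequence should
+// match torch.randn on the CPU.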
+// Port from pytorch, original license: https://github.com/pytorch/pytorch/blob/d01a7b0241ed1c4cded7e7ca097249feb343f072/LICENSE +// Ref: https://github.com/pytorch/pytorch/blob/d01a7b0241ed1c4cded7e7ca097249feb343f072/aten/src/ATen/core/TransformationHelper.h, for uniform_real +// Ref: https://github.com/pytorch/pytorch/blob/d01a7b0241ed1c4cded7e7ca097249feb343f072/aten/src/ATen/native/cpu/DistributionTemplates.h, for normal_kernel/normal_fill/normal_fill_16 +// Ref: https://github.com/pytorch/pytorch/blob/d01a7b0241ed1c4cded7e7ca097249feb343f072/aten/src/ATen/core/MT19937RNGEngine.h, for mt19937_engine +// Ref: https://github.com/pytorch/pytorch/blob/d01a7b0241ed1c4cded7e7ca097249feb343f072/aten/src/ATen/core/DistributionsHelper.h, for uniform_real_distribution/normal_distribution +class MT19937RNG : public RNG { + static const int N = 624; + static const int M = 397; + static const uint32_t MATRIX_A = 0x9908b0dfU; + static const uint32_t UMASK = 0x80000000U; + static const uint32_t LMASK = 0x7fffffffU; + + struct State { + uint64_t seed_; + int left_; + bool seeded_; + uint32_t next_; + std::array state_; + bool has_next_gauss = false; + double next_gauss = 0.0f; + }; + + State s; + + uint32_t mix_bits(uint32_t u, uint32_t v) { return (u & UMASK) | (v & LMASK); } + uint32_t twist(uint32_t u, uint32_t v) { return (mix_bits(u, v) >> 1) ^ ((v & 1) ? MATRIX_A : 0); } + void next_state() { + uint32_t* p = s.state_.data(); + s.left_ = N; + s.next_ = 0; + for (int j = N - M + 1; --j; p++) + p[0] = p[M] ^ twist(p[0], p[1]); + for (int j = M; --j; p++) + p[0] = p[M - N] ^ twist(p[0], p[1]); + p[0] = p[M - N] ^ twist(p[0], s.state_[0]); + } + + uint32_t rand_uint32() { + if (--s.left_ == 0) + next_state(); + uint32_t y = s.state_[s.next_++]; + y ^= (y >> 11); + y ^= (y << 7) & 0x9d2c5680U; + y ^= (y << 15) & 0xefc60000U; + y ^= (y >> 18); + return y; + } + + uint64_t rand_uint64() { + uint64_t high = (uint64_t)rand_uint32(); + uint64_t low = (uint64_t)rand_uint32(); + return (high << 32) | low; + } + + template + T uniform_real(V val, T from, T to) { + constexpr auto MASK = static_cast((static_cast(1) << std::numeric_limits::digits) - 1); + constexpr auto DIVISOR = static_cast(1) / (static_cast(1) << std::numeric_limits::digits); + T x = (val & MASK) * DIVISOR; + return (x * (to - from) + from); + } + + double normal_double_value(double mean, double std) { + if (s.has_next_gauss) { + s.has_next_gauss = false; + return s.next_gauss; + } + double u1 = uniform_real(rand_uint64(), 0., 1.); // double + double u2 = uniform_real(rand_uint64(), 0., 1.); // double + + double r = std::sqrt(-2.0 * std::log1p(-u2)); + double theta = 2.0 * 3.14159265358979323846 * u1; + double value = r * std::cos(theta) * std + mean; + s.next_gauss = r * std::sin(theta) * std + mean; + s.has_next_gauss = true; + return value; + } + + void normal_fill_16(float* data, float mean, float std) { + for (int j = 0; j < 8; ++j) { + float u1 = 1.0f - data[j]; + float u2 = data[j + 8]; + float r = std::sqrt(-2.0f * std::log(u1)); + float theta = 2.0f * 3.14159265358979323846 * u2; + data[j] = r * std::cos(theta) * std + mean; + data[j + 8] = r * std::sin(theta) * std + mean; + } + } + + void randn(float* data, int64_t size, float mean = 0.0f, float std = 1.0f) { + if (size >= 16) { + for (int64_t i = 0; i < size; i++) { + data[i] = uniform_real(rand_uint32(), 0.f, 1.f); + } + for (int64_t i = 0; i < size - 15; i += 16) { + normal_fill_16(data + i, mean, std); + } + if (size % 16 != 0) { + // Recompute the last 16 values. 
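+                // PyTorch's normal_fill works on blocks of 16 floats; when size is not a
+                // multiple of 16, the final (overlapping) window is refilled from fresh
+                // uniforms so the tail stays consistent with torch for any size.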
+ data = data + size - 16; + for (int64_t i = 0; i < 16; i++) { + data[i] = uniform_real(rand_uint32(), 0.f, 1.f); + } + normal_fill_16(data, mean, std); + } + } else { + // Strange handling, hard to understand, but keeping it consistent with PyTorch. + for (int64_t i = 0; i < size; i++) { + data[i] = (float)normal_double_value(mean, std); + } + } + } + +public: + MT19937RNG(uint64_t seed = 0) { manual_seed(seed); } + + void manual_seed(uint64_t seed) override { + s.seed_ = seed; + s.seeded_ = true; + s.state_[0] = (uint32_t)(seed & 0xffffffffU); + for (int j = 1; j < N; j++) { + uint32_t prev = s.state_[j - 1]; + s.state_[j] = 1812433253U * (prev ^ (prev >> 30)) + j; + } + s.left_ = 1; + s.next_ = 0; + s.has_next_gauss = false; + } + + std::vector randn(uint32_t n) override { + std::vector out; + out.resize(n); + randn((float*)out.data(), out.size()); + return out; + } +}; + +#endif // __RNG_MT19937_HPP__ \ No newline at end of file diff --git a/stable-diffusion.cpp b/stable-diffusion.cpp index 4cea83a19..d28aee179 100644 --- a/stable-diffusion.cpp +++ b/stable-diffusion.cpp @@ -2,6 +2,7 @@ #include "model.h" #include "rng.hpp" +#include "rng_mt19937.hpp" #include "rng_philox.hpp" #include "stable-diffusion.h" #include "util.h" @@ -17,6 +18,7 @@ #include "vae.hpp" #include "latent-preview.h" +#include "name_conversion.h" const char* model_version_to_str[] = { "SD 1.x", @@ -97,10 +99,11 @@ class StableDiffusionGGML { bool vae_decode_only = false; bool free_params_immediately = false; - std::shared_ptr rng = std::make_shared(); - int n_threads = -1; - float scale_factor = 0.18215f; - float shift_factor = 0.f; + std::shared_ptr rng = std::make_shared(); + std::shared_ptr sampler_rng = nullptr; + int n_threads = -1; + float scale_factor = 0.18215f; + float shift_factor = 0.f; std::shared_ptr cond_stage_model; std::shared_ptr clip_vision; // for svd or wan2.1 i2v @@ -108,10 +111,14 @@ class StableDiffusionGGML { std::shared_ptr high_noise_diffusion_model; std::shared_ptr first_stage_model; std::shared_ptr tae_first_stage; - std::shared_ptr control_net = nullptr; + std::shared_ptr control_net; std::shared_ptr pmid_model; std::shared_ptr pmid_lora; std::shared_ptr pmid_id_embeds; + std::vector> cond_stage_lora_models; + std::vector> diffusion_lora_models; + std::vector> first_stage_lora_models; + bool apply_lora_immediately = false; std::string taesd_path; bool use_tiny_autoencoder = false; @@ -182,6 +189,16 @@ class StableDiffusionGGML { } } + std::shared_ptr get_rng(rng_type_t rng_type) { + if (rng_type == STD_DEFAULT_RNG) { + return std::make_shared(); + } else if (rng_type == CPU_RNG) { + return std::make_shared(); + } else { // default: CUDA_RNG + return std::make_shared(); + } + } + bool init(const sd_ctx_params_t* sd_ctx_params) { n_threads = sd_ctx_params->n_threads; vae_decode_only = sd_ctx_params->vae_decode_only; @@ -191,10 +208,11 @@ class StableDiffusionGGML { use_tiny_autoencoder = taesd_path.size() > 0; offload_params_to_cpu = sd_ctx_params->offload_params_to_cpu; - if (sd_ctx_params->rng_type == STD_DEFAULT_RNG) { - rng = std::make_shared(); - } else if (sd_ctx_params->rng_type == CUDA_RNG) { - rng = std::make_shared(); + rng = get_rng(sd_ctx_params->rng_type); + if (sd_ctx_params->sampler_rng_type != RNG_TYPE_COUNT) { + sampler_rng = get_rng(sd_ctx_params->sampler_rng_type); + } else { + sampler_rng = rng; } ggml_log_set(ggml_log_callback_default, nullptr); @@ -296,11 +314,12 @@ class StableDiffusionGGML { } LOG_INFO("Version: %s ", model_version_to_str[version]); - ggml_type 
wtype = (int)sd_ctx_params->wtype < std::min(SD_TYPE_COUNT, GGML_TYPE_COUNT) - ? (ggml_type)sd_ctx_params->wtype - : GGML_TYPE_COUNT; - if (wtype != GGML_TYPE_COUNT) { - model_loader.set_wtype_override(wtype); + ggml_type wtype = (int)sd_ctx_params->wtype < std::min(SD_TYPE_COUNT, GGML_TYPE_COUNT) + ? (ggml_type)sd_ctx_params->wtype + : GGML_TYPE_COUNT; + std::string tensor_type_rules = SAFE_STR(sd_ctx_params->tensor_type_rules); + if (wtype != GGML_TYPE_COUNT || tensor_type_rules.size() > 0) { + model_loader.set_wtype_override(wtype, tensor_type_rules); } std::map wtype_stat = model_loader.get_wtype_stat(); @@ -329,6 +348,29 @@ class StableDiffusionGGML { LOG_DEBUG("ggml tensor size = %d bytes", (int)sizeof(ggml_tensor)); + if (sd_ctx_params->lora_apply_mode == LORA_APPLY_AUTO) { + bool have_quantized_weight = false; + if (wtype != GGML_TYPE_COUNT && ggml_is_quantized(wtype)) { + have_quantized_weight = true; + } else { + for (const auto& [type, _] : wtype_stat) { + if (ggml_is_quantized(type)) { + have_quantized_weight = true; + break; + } + } + } + if (have_quantized_weight) { + apply_lora_immediately = false; + } else { + apply_lora_immediately = true; + } + } else if (sd_ctx_params->lora_apply_mode == LORA_APPLY_IMMEDIATELY) { + apply_lora_immediately = true; + } else { + apply_lora_immediately = false; + } + if (sd_version_is_sdxl(version)) { scale_factor = 0.13025f; } else if (sd_version_is_sd3(version)) { @@ -571,8 +613,14 @@ class StableDiffusionGGML { version); } if (strlen(SAFE_STR(sd_ctx_params->photo_maker_path)) > 0) { - pmid_lora = std::make_shared(backend, sd_ctx_params->photo_maker_path, "", version); - if (!pmid_lora->load_from_file(true, n_threads)) { + pmid_lora = std::make_shared("pmid", backend, sd_ctx_params->photo_maker_path, "", version); + auto lora_tensor_filter = [&](const std::string& tensor_name) { + if (starts_with(tensor_name, "lora.model")) { + return true; + } + return false; + }; + if (!pmid_lora->load_from_file(n_threads, lora_tensor_filter)) { LOG_WARN("load photomaker lora tensors from %s failed", sd_ctx_params->photo_maker_path); return false; } @@ -907,8 +955,11 @@ class StableDiffusionGGML { return result < -1; } - void apply_lora(std::string lora_name, float multiplier) { - int64_t t0 = ggml_time_ms(); + std::shared_ptr load_lora_model_from_file(const std::string& lora_id, + float multiplier, + ggml_backend_t backend, + LoraModel::filter_t lora_tensor_filter = nullptr) { + std::string lora_name = lora_id; std::string high_noise_tag = "|high_noise|"; bool is_high_noise = false; if (starts_with(lora_name, high_noise_tag)) { @@ -925,25 +976,19 @@ class StableDiffusionGGML { file_path = ckpt_file_path; } else { LOG_WARN("can not find %s or %s for lora %s", st_file_path.c_str(), ckpt_file_path.c_str(), lora_name.c_str()); - return; + return nullptr; } - LoraModel lora(backend, file_path, is_high_noise ? "model.high_noise_" : "", version); - if (!lora.load_from_file(false, n_threads)) { + auto lora = std::make_shared(lora_id, backend, file_path, is_high_noise ? "model.high_noise_" : "", version); + if (!lora->load_from_file(n_threads, lora_tensor_filter)) { LOG_WARN("load lora tensors from %s failed", file_path.c_str()); - return; + return nullptr; } - lora.multiplier = multiplier; - // TODO: send version? 
- lora.apply(tensors, version, n_threads); - lora.free_params_buffer(); - - int64_t t1 = ggml_time_ms(); - - LOG_INFO("lora '%s' applied, taking %.2fs", lora_name.c_str(), (t1 - t0) * 1.0f / 1000); + lora->multiplier = multiplier; + return lora; } - void apply_loras(const std::unordered_map& lora_state) { + void apply_loras_immediately(const std::unordered_map& lora_state) { std::unordered_map lora_state_diff; for (auto& kv : lora_state) { const std::string& lora_name = kv.first; @@ -964,12 +1009,149 @@ class StableDiffusionGGML { } for (auto& kv : lora_state_diff) { - apply_lora(kv.first, kv.second); + int64_t t0 = ggml_time_ms(); + + auto lora = load_lora_model_from_file(kv.first, kv.second, backend); + lora->apply(tensors, version, n_threads); + lora->free_params_buffer(); + + int64_t t1 = ggml_time_ms(); + + LOG_INFO("lora '%s' applied, taking %.2fs", kv.first.c_str(), (t1 - t0) * 1.0f / 1000); } curr_lora_state = lora_state; } + void apply_loras_at_runtime(const std::unordered_map& lora_state) { + cond_stage_lora_models.clear(); + diffusion_lora_models.clear(); + first_stage_lora_models.clear(); + if (cond_stage_model) { + std::vector> lora_models; + auto lora_state_diff = lora_state; + for (auto& lora_model : cond_stage_lora_models) { + auto iter = lora_state_diff.find(lora_model->lora_id); + + if (iter != lora_state_diff.end()) { + lora_model->multiplier = iter->second; + lora_models.push_back(lora_model); + lora_state_diff.erase(iter); + } + } + cond_stage_lora_models = lora_models; + auto lora_tensor_filter = [&](const std::string& tensor_name) { + if (is_cond_stage_model_name(tensor_name)) { + return true; + } + return false; + }; + for (auto& kv : lora_state_diff) { + const std::string& lora_id = kv.first; + float multiplier = kv.second; + + auto lora = load_lora_model_from_file(lora_id, multiplier, clip_backend, lora_tensor_filter); + if (lora && !lora->lora_tensors.empty()) { + lora->preprocess_lora_tensors(tensors); + cond_stage_lora_models.push_back(lora); + } + } + auto multi_lora_adapter = std::make_shared(cond_stage_lora_models); + cond_stage_model->set_weight_adapter(multi_lora_adapter); + } + if (diffusion_model) { + std::vector> lora_models; + auto lora_state_diff = lora_state; + for (auto& lora_model : diffusion_lora_models) { + auto iter = lora_state_diff.find(lora_model->lora_id); + + if (iter != lora_state_diff.end()) { + lora_model->multiplier = iter->second; + lora_models.push_back(lora_model); + lora_state_diff.erase(iter); + } + } + diffusion_lora_models = lora_models; + auto lora_tensor_filter = [&](const std::string& tensor_name) { + if (is_diffusion_model_name(tensor_name)) { + return true; + } + return false; + }; + for (auto& kv : lora_state_diff) { + const std::string& lora_name = kv.first; + float multiplier = kv.second; + + auto lora = load_lora_model_from_file(lora_name, multiplier, backend, lora_tensor_filter); + if (lora && !lora->lora_tensors.empty()) { + lora->preprocess_lora_tensors(tensors); + diffusion_lora_models.push_back(lora); + } + } + auto multi_lora_adapter = std::make_shared(diffusion_lora_models); + diffusion_model->set_weight_adapter(multi_lora_adapter); + if (high_noise_diffusion_model) { + high_noise_diffusion_model->set_weight_adapter(multi_lora_adapter); + } + } + + if (first_stage_model) { + std::vector> lora_models; + auto lora_state_diff = lora_state; + for (auto& lora_model : first_stage_lora_models) { + auto iter = lora_state_diff.find(lora_model->lora_id); + + if (iter != lora_state_diff.end()) { + lora_model->multiplier = 
iter->second; + lora_models.push_back(lora_model); + lora_state_diff.erase(iter); + } + } + first_stage_lora_models = lora_models; + auto lora_tensor_filter = [&](const std::string& tensor_name) { + if (is_first_stage_model_name(tensor_name)) { + return true; + } + return false; + }; + for (auto& kv : lora_state_diff) { + const std::string& lora_name = kv.first; + float multiplier = kv.second; + + auto lora = load_lora_model_from_file(lora_name, multiplier, vae_backend, lora_tensor_filter); + if (lora && !lora->lora_tensors.empty()) { + lora->preprocess_lora_tensors(tensors); + first_stage_lora_models.push_back(lora); + } + } + auto multi_lora_adapter = std::make_shared(first_stage_lora_models); + first_stage_model->set_weight_adapter(multi_lora_adapter); + } + } + + void lora_stat() { + if (!cond_stage_lora_models.empty()) { + LOG_INFO("cond_stage_lora_models:"); + for (auto& lora_model : cond_stage_lora_models) { + lora_model->stat(); + } + } + + if (!diffusion_lora_models.empty()) { + LOG_INFO("diffusion_lora_models:"); + for (auto& lora_model : diffusion_lora_models) { + lora_model->stat(); + } + } + + if (!first_stage_lora_models.empty()) { + LOG_INFO("first_stage_lora_models:"); + for (auto& lora_model : first_stage_lora_models) { + lora_model->stat(); + } + } + } + std::string apply_loras_from_prompt(const std::string& prompt) { auto result_pair = extract_and_remove_lora(prompt); std::unordered_map lora_f2m = result_pair.first; // lora_name -> multiplier @@ -978,10 +1160,18 @@ class StableDiffusionGGML { LOG_DEBUG("lora %s:%.2f", kv.first.c_str(), kv.second); } int64_t t0 = ggml_time_ms(); - apply_loras(lora_f2m); + if (apply_lora_immediately) { + LOG_INFO("apply lora immediately"); + apply_loras_immediately(lora_f2m); + } else { + LOG_INFO("apply at runtime"); + apply_loras_at_runtime(lora_f2m); + } int64_t t1 = ggml_time_ms(); - LOG_INFO("apply_loras completed, taking %.2fs", (t1 - t0) * 1.0f / 1000); - LOG_DEBUG("prompt after extract and remove lora: \"%s\"", result_pair.second.c_str()); + if (!lora_f2m.empty()) { + LOG_INFO("apply_loras completed, taking %.2fs", (t1 - t0) * 1.0f / 1000); + LOG_DEBUG("prompt after extract and remove lora: \"%s\"", result_pair.second.c_str()); + } return result_pair.second; } @@ -1556,7 +1746,7 @@ class StableDiffusionGGML { return denoised; }; - sample_k_diffusion(method, denoise, work_ctx, x, sigmas, rng, eta); + sample_k_diffusion(method, denoise, work_ctx, x, sigmas, sampler_rng, eta); if (inverse_noise_scaling) { x = denoiser->inverse_noise_scaling(sigmas[sigmas.size() - 1], x); @@ -1955,6 +2145,7 @@ enum sd_type_t str_to_sd_type(const char* str) { const char* rng_type_to_str[] = { "std_default", "cuda", + "cpu", }; const char* sd_rng_type_name(enum rng_type_t rng_type) { @@ -2081,6 +2272,28 @@ enum preview_t str_to_preview(const char* str) { return PREVIEW_COUNT; } +const char* lora_apply_mode_to_str[] = { + "auto", + "immediately", + "at_runtime", +}; + +const char* sd_lora_apply_mode_name(enum lora_apply_mode_t mode) { + if (mode < LORA_APPLY_MODE_COUNT) { + return lora_apply_mode_to_str[mode]; + } + return NONE_STR; +} + +enum lora_apply_mode_t str_to_lora_apply_mode(const char* str) { + for (int i = 0; i < LORA_APPLY_MODE_COUNT; i++) { + if (!strcmp(str, lora_apply_mode_to_str[i])) { + return (enum lora_apply_mode_t)i; + } + } + return LORA_APPLY_MODE_COUNT; +} + void sd_ctx_params_init(sd_ctx_params_t* sd_ctx_params) { *sd_ctx_params = {}; sd_ctx_params->vae_decode_only = true; @@ -2088,7 +2301,9 @@ void 
sd_ctx_params_init(sd_ctx_params_t* sd_ctx_params) { sd_ctx_params->n_threads = get_num_physical_cores(); sd_ctx_params->wtype = SD_TYPE_COUNT; sd_ctx_params->rng_type = CUDA_RNG; + sd_ctx_params->sampler_rng_type = RNG_TYPE_COUNT; sd_ctx_params->prediction = DEFAULT_PRED; + sd_ctx_params->lora_apply_mode = LORA_APPLY_AUTO; sd_ctx_params->offload_params_to_cpu = false; sd_ctx_params->keep_clip_on_cpu = false; sd_ctx_params->keep_control_net_on_cpu = false; @@ -2122,11 +2337,13 @@ char* sd_ctx_params_to_str(const sd_ctx_params_t* sd_ctx_params) { "lora_model_dir: %s\n" "embedding_dir: %s\n" "photo_maker_path: %s\n" + "tensor_type_rules: %s\n" "vae_decode_only: %s\n" "free_params_immediately: %s\n" "n_threads: %d\n" "wtype: %s\n" "rng_type: %s\n" + "sampler_rng_type: %s\n" "prediction: %s\n" "offload_params_to_cpu: %s\n" "keep_clip_on_cpu: %s\n" @@ -2151,11 +2368,13 @@ char* sd_ctx_params_to_str(const sd_ctx_params_t* sd_ctx_params) { SAFE_STR(sd_ctx_params->lora_model_dir), SAFE_STR(sd_ctx_params->embedding_dir), SAFE_STR(sd_ctx_params->photo_maker_path), + SAFE_STR(sd_ctx_params->tensor_type_rules), BOOL_STR(sd_ctx_params->vae_decode_only), BOOL_STR(sd_ctx_params->free_params_immediately), sd_ctx_params->n_threads, sd_type_name(sd_ctx_params->wtype), sd_rng_type_name(sd_ctx_params->rng_type), + sd_rng_type_name(sd_ctx_params->sampler_rng_type), sd_prediction_name(sd_ctx_params->prediction), BOOL_STR(sd_ctx_params->offload_params_to_cpu), BOOL_STR(sd_ctx_params->keep_clip_on_cpu), @@ -2454,18 +2673,24 @@ sd_image_t* generate_image_internal(sd_ctx_t* sd_ctx, LOG_WARN("Turn off PhotoMaker"); sd_ctx->sd->stacked_id = false; } else { - id_cond.c_crossattn = sd_ctx->sd->id_encoder(work_ctx, init_img, id_cond.c_crossattn, id_embeds, class_tokens_mask); - int64_t t1 = ggml_time_ms(); - LOG_INFO("Photomaker ID Stacking, taking %" PRId64 " ms", t1 - t0); - if (sd_ctx->sd->free_params_immediately) { - sd_ctx->sd->pmid_model->free_params_buffer(); - } - // Encode input prompt without the trigger word for delayed conditioning - prompt_text_only = sd_ctx->sd->cond_stage_model->remove_trigger_from_prompt(work_ctx, prompt); - // printf("%s || %s \n", prompt.c_str(), prompt_text_only.c_str()); - prompt = prompt_text_only; // - if (sample_steps < 50) { - LOG_WARN("It's recommended to use >= 50 steps for photo maker!"); + if (pm_params.id_images_count != id_embeds->ne[1]) { + LOG_WARN("PhotoMaker image count (%d) does NOT match ID embeds (%d). 
You should run face_detect.py again.", pm_params.id_images_count, id_embeds->ne[1]); + LOG_WARN("Turn off PhotoMaker"); + sd_ctx->sd->stacked_id = false; + } else { + id_cond.c_crossattn = sd_ctx->sd->id_encoder(work_ctx, init_img, id_cond.c_crossattn, id_embeds, class_tokens_mask); + int64_t t1 = ggml_time_ms(); + LOG_INFO("Photomaker ID Stacking, taking %" PRId64 " ms", t1 - t0); + if (sd_ctx->sd->free_params_immediately) { + sd_ctx->sd->pmid_model->free_params_buffer(); + } + // Encode input prompt without the trigger word for delayed conditioning + prompt_text_only = sd_ctx->sd->cond_stage_model->remove_trigger_from_prompt(work_ctx, prompt); + // printf("%s || %s \n", prompt.c_str(), prompt_text_only.c_str()); + prompt = prompt_text_only; // + if (sample_steps < 50) { + LOG_WARN("It's recommended to use >= 50 steps for photo maker!"); + } } } } else { @@ -2611,6 +2836,7 @@ sd_image_t* generate_image_internal(sd_ctx_t* sd_ctx, LOG_INFO("generating image: %i/%i - seed %" PRId64, b + 1, batch_count, cur_seed); sd_ctx->sd->rng->manual_seed(cur_seed); + sd_ctx->sd->sampler_rng->manual_seed(cur_seed); struct ggml_tensor* x_t = init_latent; struct ggml_tensor* noise = ggml_new_tensor_4d(work_ctx, GGML_TYPE_F32, W, H, C, 1); ggml_ext_im_set_randn_f32(noise, sd_ctx->sd->rng); @@ -2674,6 +2900,9 @@ sd_image_t* generate_image_internal(sd_ctx_t* sd_ctx, if (sd_ctx->sd->free_params_immediately && !sd_ctx->sd->use_tiny_autoencoder) { sd_ctx->sd->first_stage_model->free_params_buffer(); } + + sd_ctx->sd->lora_stat(); + sd_image_t* result_images = (sd_image_t*)calloc(batch_count, sizeof(sd_image_t)); if (result_images == nullptr) { ggml_free(work_ctx); @@ -2734,6 +2963,7 @@ sd_image_t* generate_image(sd_ctx_t* sd_ctx, const sd_img_gen_params_t* sd_img_g seed = rand(); } sd_ctx->sd->rng->manual_seed(seed); + sd_ctx->sd->sampler_rng->manual_seed(seed); int sample_steps = sd_img_gen_params->sample_params.sample_steps; @@ -3025,6 +3255,7 @@ SD_API sd_image_t* generate_video(sd_ctx_t* sd_ctx, const sd_vid_gen_params_t* s } sd_ctx->sd->rng->manual_seed(seed); + sd_ctx->sd->sampler_rng->manual_seed(seed); int64_t t0 = ggml_time_ms(); @@ -3343,6 +3574,8 @@ SD_API sd_image_t* generate_video(sd_ctx_t* sd_ctx, const sd_vid_gen_params_t* s sd_ctx->sd->first_stage_model->free_params_buffer(); } + sd_ctx->sd->lora_stat(); + sd_image_t* result_images = (sd_image_t*)calloc(vid->ne[2], sizeof(sd_image_t)); if (result_images == nullptr) { ggml_free(work_ctx); diff --git a/stable-diffusion.h b/stable-diffusion.h index 9e99d53de..6be85af22 100644 --- a/stable-diffusion.h +++ b/stable-diffusion.h @@ -31,6 +31,7 @@ extern "C" { enum rng_type_t { STD_DEFAULT_RNG, CUDA_RNG, + CPU_RNG, RNG_TYPE_COUNT }; @@ -134,6 +135,13 @@ enum preview_t { PREVIEW_COUNT }; +enum lora_apply_mode_t { + LORA_APPLY_AUTO, + LORA_APPLY_IMMEDIATELY, + LORA_APPLY_AT_RUNTIME, + LORA_APPLY_MODE_COUNT, +}; + typedef struct { bool enabled; int tile_size_x; @@ -159,12 +167,15 @@ typedef struct { const char* lora_model_dir; const char* embedding_dir; const char* photo_maker_path; + const char* tensor_type_rules; bool vae_decode_only; bool free_params_immediately; int n_threads; enum sd_type_t wtype; enum rng_type_t rng_type; + enum rng_type_t sampler_rng_type; enum prediction_t prediction; + enum lora_apply_mode_t lora_apply_mode; bool offload_params_to_cpu; bool keep_clip_on_cpu; bool keep_control_net_on_cpu; @@ -267,7 +278,7 @@ typedef void (*sd_preview_cb_t)(int step, int frame_count, sd_image_t* frames, b SD_API void sd_set_log_callback(sd_log_cb_t 
sd_log_cb, void* data); SD_API void sd_set_progress_callback(sd_progress_cb_t cb, void* data); -SD_API void sd_set_preview_callback(sd_preview_cb_t cb, preview_t mode, int interval, bool denoised, bool noisy); +SD_API void sd_set_preview_callback(sd_preview_cb_t cb, enum preview_t mode, int interval, bool denoised, bool noisy); SD_API int32_t get_num_physical_cores(); SD_API const char* sd_get_system_info(); @@ -283,6 +294,8 @@ SD_API const char* sd_prediction_name(enum prediction_t prediction); SD_API enum prediction_t str_to_prediction(const char* str); SD_API const char* sd_preview_name(enum preview_t preview); SD_API enum preview_t str_to_preview(const char* str); +SD_API const char* sd_lora_apply_mode_name(enum lora_apply_mode_t mode); +SD_API enum lora_apply_mode_t str_to_lora_apply_mode(const char* str); SD_API void sd_ctx_params_init(sd_ctx_params_t* sd_ctx_params); SD_API char* sd_ctx_params_to_str(const sd_ctx_params_t* sd_ctx_params); diff --git a/unet.hpp b/unet.hpp index 8f0adf389..de05f464d 100644 --- a/unet.hpp +++ b/unet.hpp @@ -7,7 +7,7 @@ /*==================================================== UnetModel =====================================================*/ -#define UNET_GRAPH_SIZE 10240 +#define UNET_GRAPH_SIZE 102400 class SpatialVideoTransformer : public SpatialTransformer { protected: @@ -612,7 +612,7 @@ struct UNetModelRunner : public GGMLRunner { int num_video_frames = -1, std::vector controls = {}, float control_strength = 0.f) { - struct ggml_cgraph* gf = ggml_new_graph_custom(compute_ctx, UNET_GRAPH_SIZE, false); + struct ggml_cgraph* gf = new_graph_custom(UNET_GRAPH_SIZE); if (num_video_frames == -1) { num_video_frames = x->ne[3]; diff --git a/util.cpp b/util.cpp index 1aa9beff8..c46216646 100644 --- a/util.cpp +++ b/util.cpp @@ -5,6 +5,7 @@ #include #include #include +#include #include #include #include @@ -547,6 +548,8 @@ sd_image_f32_t clip_preprocess(sd_image_f32_t image, int target_width, int targe // (abc) - increases attention to abc by a multiplier of 1.1 // (abc:3.12) - increases attention to abc by a multiplier of 3.12 // [abc] - decreases attention to abc by a multiplier of 1.1 +// BREAK - separates the prompt into conceptually distinct parts for sequential processing +// B - internal helper pattern; prevents 'B' in 'BREAK' from being consumed as normal text // \( - literal character '(' // \[ - literal character '[' // \) - literal character ')' @@ -582,7 +585,7 @@ std::vector> parse_prompt_attention(const std::str float round_bracket_multiplier = 1.1f; float square_bracket_multiplier = 1 / 1.1f; - std::regex re_attention(R"(\\\(|\\\)|\\\[|\\\]|\\\\|\\|\(|\[|:([+-]?[.\d]+)\)|\)|\]|[^\\()\[\]:]+|:)"); + std::regex re_attention(R"(\\\(|\\\)|\\\[|\\\]|\\\\|\\|\(|\[|:([+-]?[.\d]+)\)|\)|\]|\bBREAK\b|[^\\()\[\]:B]+|:|\bB)"); std::regex re_break(R"(\s*\bBREAK\b\s*)"); auto multiply_range = [&](int start_position, float multiplier) { @@ -591,7 +594,7 @@ std::vector> parse_prompt_attention(const std::str } }; - std::smatch m; + std::smatch m, m2; std::string remaining_text = text; while (std::regex_search(remaining_text, m, re_attention)) { @@ -615,6 +618,8 @@ std::vector> parse_prompt_attention(const std::str square_brackets.pop_back(); } else if (text == "\\(") { res.push_back({text.substr(1), 1.0f}); + } else if (std::regex_search(text, m2, re_break)) { + res.push_back({"BREAK", -1.0f}); } else { res.push_back({text, 1.0f}); } @@ -645,4 +650,4 @@ std::vector> parse_prompt_attention(const std::str } return res; -} \ No newline at end of file +} diff --git 
a/wan.hpp b/wan.hpp index 91a2e9205..41882e790 100644 --- a/wan.hpp +++ b/wan.hpp @@ -1133,7 +1133,7 @@ namespace WAN { } struct ggml_cgraph* build_graph(struct ggml_tensor* z, bool decode_graph) { - struct ggml_cgraph* gf = ggml_new_graph_custom(compute_ctx, 10240 * z->ne[2], false); + struct ggml_cgraph* gf = new_graph_custom(10240 * z->ne[2]); z = to_backend(z); @@ -1147,7 +1147,7 @@ namespace WAN { } struct ggml_cgraph* build_graph_partial(struct ggml_tensor* z, bool decode_graph, int64_t i) { - struct ggml_cgraph* gf = ggml_new_graph_custom(compute_ctx, 20480, false); + struct ggml_cgraph* gf = new_graph_custom(20480); ae.clear_cache(); @@ -2142,7 +2142,7 @@ namespace WAN { struct ggml_tensor* time_dim_concat = nullptr, struct ggml_tensor* vace_context = nullptr, float vace_strength = 1.f) { - struct ggml_cgraph* gf = ggml_new_graph_custom(compute_ctx, WAN_GRAPH_SIZE, false); + struct ggml_cgraph* gf = new_graph_custom(WAN_GRAPH_SIZE); x = to_backend(x); timesteps = to_backend(timesteps);