
Commit 347710f

feat: support applying LoRA at runtime (leejet#969)
1 parent 59ebdf0 commit 347710f

21 files changed: +896 −222 lines

clip.hpp

Lines changed: 1 addition & 1 deletion
@@ -936,7 +936,7 @@ struct CLIPTextModelRunner : public GGMLRunner {
                          size_t max_token_idx = 0,
                          bool return_pooled   = false,
                          int clip_skip        = -1) {
-        struct ggml_cgraph* gf = ggml_new_graph(compute_ctx);
+        struct ggml_cgraph* gf = new_graph_custom(2048);

         input_ids = to_backend(input_ids);

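The same one-line change recurs in control.hpp, esrgan.hpp, and flux.hpp below: runners stop calling ggml_new_graph/ggml_new_graph_custom directly and route graph creation through a new GGMLRunner::new_graph_custom helper. A minimal sketch of what such a helper might look like, assuming it simply centralizes graph creation on the runner's compute context; the weight-adapter member and its method name are assumptions, not shown in this diff:

    // Sketch only: one place to build compute graphs, so a runtime weight
    // adapter can enlarge the node budget for the extra LoRA mul_mat/scale/add
    // nodes it splices into the graph.
    struct ggml_cgraph* new_graph_custom(size_t graph_size) {
        if (weight_adapter) {                                  // assumed member
            graph_size += weight_adapter->extra_graph_size();  // assumed hook
        }
        return ggml_new_graph_custom(compute_ctx, graph_size, /*grads=*/false);
    }
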
common.hpp

Lines changed: 8 additions & 18 deletions
@@ -182,31 +182,21 @@ class GEGLU : public UnaryBlock {
     int64_t dim_in;
     int64_t dim_out;

-    void init_params(struct ggml_context* ctx, const String2TensorStorage& tensor_storage_map = {}, std::string prefix = "") override {
-        enum ggml_type wtype      = get_type(prefix + "proj.weight", tensor_storage_map, GGML_TYPE_F32);
-        enum ggml_type bias_wtype = GGML_TYPE_F32;
-        params["proj.weight"]     = ggml_new_tensor_2d(ctx, wtype, dim_in, dim_out * 2);
-        params["proj.bias"]       = ggml_new_tensor_1d(ctx, bias_wtype, dim_out * 2);
-    }
-
 public:
     GEGLU(int64_t dim_in, int64_t dim_out)
-        : dim_in(dim_in), dim_out(dim_out) {}
+        : dim_in(dim_in), dim_out(dim_out) {
+        blocks["proj"] = std::shared_ptr<GGMLBlock>(new Linear(dim_in, dim_out * 2));
+    }

     struct ggml_tensor* forward(GGMLRunnerContext* ctx, struct ggml_tensor* x) override {
         // x: [ne3, ne2, ne1, dim_in]
         // return: [ne3, ne2, ne1, dim_out]
-        struct ggml_tensor* w = params["proj.weight"];
-        struct ggml_tensor* b = params["proj.bias"];
-
-        auto x_w    = ggml_view_2d(ctx->ggml_ctx, w, w->ne[0], w->ne[1] / 2, w->nb[1], 0);                        // [dim_out, dim_in]
-        auto x_b    = ggml_view_1d(ctx->ggml_ctx, b, b->ne[0] / 2, 0);                                            // [dim_out,]
-        auto gate_w = ggml_view_2d(ctx->ggml_ctx, w, w->ne[0], w->ne[1] / 2, w->nb[1], w->nb[1] * w->ne[1] / 2);  // [dim_out, dim_in]
-        auto gate_b = ggml_view_1d(ctx->ggml_ctx, b, b->ne[0] / 2, b->nb[0] * b->ne[0] / 2);                      // [dim_out,]
+        auto proj = std::dynamic_pointer_cast<Linear>(blocks["proj"]);

-        auto x_in = x;
-        x         = ggml_ext_linear(ctx->ggml_ctx, x_in, x_w, x_b);        // [ne3, ne2, ne1, dim_out]
-        auto gate = ggml_ext_linear(ctx->ggml_ctx, x_in, gate_w, gate_b);  // [ne3, ne2, ne1, dim_out]
+        x          = proj->forward(ctx, x);  // [ne3, ne2, ne1, dim_out*2]
+        auto x_vec = ggml_ext_chunk(ctx->ggml_ctx, x, 2, 0);
+        x          = x_vec[0];   // [ne3, ne2, ne1, dim_out]
+        auto gate  = x_vec[1];   // [ne3, ne2, ne1, dim_out]

         gate = ggml_gelu_inplace(ctx->ggml_ctx, gate);

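The point of this refactor for the commit: once the projection is an ordinary Linear block instead of raw params, its weight travels the same code path as every other linear layer, which is presumably where the runtime LoRA adapter hooks in. The new forward() relies on ggml_ext_chunk to split the fused [.., dim_out*2] projection; its definition is not in this excerpt, so the following is only a sketch of plausible semantics (a dim-0 split of a non-quantized tensor into zero-copy views; the dim-0-only restriction is an assumption):

    // Sketch: split x into `chunks` equal views along dim 0.
    std::vector<struct ggml_tensor*> ggml_ext_chunk(struct ggml_context* ctx,
                                                    struct ggml_tensor* x,
                                                    int chunks,
                                                    int dim) {
        GGML_ASSERT(dim == 0 && x->ne[0] % chunks == 0);  // sketch: dim 0 only
        const int64_t step = x->ne[0] / chunks;
        std::vector<struct ggml_tensor*> out;
        for (int i = 0; i < chunks; i++) {
            // byte offset assumes a non-quantized row layout
            out.push_back(ggml_view_4d(ctx, x, step, x->ne[1], x->ne[2], x->ne[3],
                                       x->nb[1], x->nb[2], x->nb[3],
                                       (size_t)i * step * ggml_type_size(x->type)));
        }
        return out;
    }
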
conditioner.hpp

Lines changed: 41 additions & 0 deletions
@@ -34,6 +34,7 @@ struct Conditioner {
     virtual void free_params_buffer()                                                   = 0;
     virtual void get_param_tensors(std::map<std::string, struct ggml_tensor*>& tensors) = 0;
     virtual size_t get_params_buffer_size()                                             = 0;
+    virtual void set_weight_adapter(const std::shared_ptr<WeightAdapter>& adapter) {}
     virtual std::tuple<SDCondition, std::vector<bool>> get_learned_condition_with_trigger(ggml_context* work_ctx,
                                                                                           int n_threads,
                                                                                           const ConditionerParams& conditioner_params) {
@@ -108,6 +109,13 @@ struct FrozenCLIPEmbedderWithCustomWords : public Conditioner {
         return buffer_size;
     }

+    void set_weight_adapter(const std::shared_ptr<WeightAdapter>& adapter) override {
+        text_model->set_weight_adapter(adapter);
+        if (sd_version_is_sdxl(version)) {
+            text_model2->set_weight_adapter(adapter);
+        }
+    }
+
     bool load_embedding(std::string embd_name, std::string embd_path, std::vector<int32_t>& bpe_tokens) {
         // the order matters
         ModelLoader model_loader;
@@ -764,6 +772,18 @@ struct SD3CLIPEmbedder : public Conditioner {
         return buffer_size;
     }

+    void set_weight_adapter(const std::shared_ptr<WeightAdapter>& adapter) override {
+        if (clip_l) {
+            clip_l->set_weight_adapter(adapter);
+        }
+        if (clip_g) {
+            clip_g->set_weight_adapter(adapter);
+        }
+        if (t5) {
+            t5->set_weight_adapter(adapter);
+        }
+    }
+
     std::vector<std::pair<std::vector<int>, std::vector<float>>> tokenize(std::string text,
                                                                           size_t max_length = 0,
                                                                           bool padding      = false) {
@@ -1160,6 +1180,15 @@ struct FluxCLIPEmbedder : public Conditioner {
         return buffer_size;
     }

+    void set_weight_adapter(const std::shared_ptr<WeightAdapter>& adapter) override {
+        if (clip_l) {
+            clip_l->set_weight_adapter(adapter);
+        }
+        if (t5) {
+            t5->set_weight_adapter(adapter);
+        }
+    }
+
     std::vector<std::pair<std::vector<int>, std::vector<float>>> tokenize(std::string text,
                                                                           size_t max_length = 0,
                                                                           bool padding      = false) {
@@ -1400,6 +1429,12 @@ struct T5CLIPEmbedder : public Conditioner {
         return buffer_size;
     }

+    void set_weight_adapter(const std::shared_ptr<WeightAdapter>& adapter) override {
+        if (t5) {
+            t5->set_weight_adapter(adapter);
+        }
+    }
+
     std::tuple<std::vector<int>, std::vector<float>, std::vector<float>> tokenize(std::string text,
                                                                                   size_t max_length = 0,
                                                                                   bool padding      = false) {
@@ -1589,6 +1624,12 @@ struct Qwen2_5_VLCLIPEmbedder : public Conditioner {
         return buffer_size;
     }

+    void set_weight_adapter(const std::shared_ptr<WeightAdapter>& adapter) override {
+        if (qwenvl) {
+            qwenvl->set_weight_adapter(adapter);
+        }
+    }
+
     std::tuple<std::vector<int>, std::vector<float>> tokenize(std::string text,
                                                               size_t max_length           = 0,
                                                               size_t system_prompt_length = 0,

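Every embedder simply fans the adapter out to the text-model runners it owns. The WeightAdapter type itself is not part of this excerpt; below is a sketch of the shape such an interface would need under the at_runtime model described in docs/lora.md further down (the method name and signature are assumptions):

    // Hypothetical interface: at graph-build time the adapter can substitute
    // a weight tensor with a node that has the LoRA delta fused into the
    // graph (conceptually W' = W + scale * (B @ A)), leaving the stored
    // weights, including quantized ones, untouched.
    struct WeightAdapter {
        virtual ~WeightAdapter() = default;
        // Return a replacement node for `weight`, or `weight` unchanged
        // when no LoRA patch matches `tensor_name`.
        virtual struct ggml_tensor* patch_weight(struct ggml_context* ctx,
                                                 struct ggml_tensor* weight,
                                                 const std::string& tensor_name) = 0;
    };
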
control.hpp

Lines changed: 1 addition & 1 deletion
@@ -380,7 +380,7 @@ struct ControlNet : public GGMLRunner {
                                     struct ggml_tensor* timesteps,
                                     struct ggml_tensor* context,
                                     struct ggml_tensor* y = nullptr) {
-        struct ggml_cgraph* gf = ggml_new_graph_custom(compute_ctx, CONTROL_NET_GRAPH_SIZE, false);
+        struct ggml_cgraph* gf = new_graph_custom(CONTROL_NET_GRAPH_SIZE);

         x = to_backend(x);
         if (guided_hint_cached) {

diffusion_model.hpp

Lines changed: 23 additions & 2 deletions
@@ -35,8 +35,9 @@ struct DiffusionModel {
     virtual void free_compute_buffer()                                                  = 0;
     virtual void get_param_tensors(std::map<std::string, struct ggml_tensor*>& tensors) = 0;
     virtual size_t get_params_buffer_size()                                             = 0;
-    virtual int64_t get_adm_in_channels()                                               = 0;
-    virtual void set_flash_attn_enabled(bool enabled)                                   = 0;
+    virtual void set_weight_adapter(const std::shared_ptr<WeightAdapter>& adapter) {}
+    virtual int64_t get_adm_in_channels()                                               = 0;
+    virtual void set_flash_attn_enabled(bool enabled)                                   = 0;
 };

 struct UNetModel : public DiffusionModel {
@@ -73,6 +74,10 @@ struct UNetModel : public DiffusionModel {
         return unet.get_params_buffer_size();
     }

+    void set_weight_adapter(const std::shared_ptr<WeightAdapter>& adapter) override {
+        unet.set_weight_adapter(adapter);
+    }
+
     int64_t get_adm_in_channels() override {
         return unet.unet.adm_in_channels;
     }
@@ -130,6 +135,10 @@ struct MMDiTModel : public DiffusionModel {
         return mmdit.get_params_buffer_size();
     }

+    void set_weight_adapter(const std::shared_ptr<WeightAdapter>& adapter) override {
+        mmdit.set_weight_adapter(adapter);
+    }
+
     int64_t get_adm_in_channels() override {
         return 768 + 1280;
     }
@@ -188,6 +197,10 @@ struct FluxModel : public DiffusionModel {
         return flux.get_params_buffer_size();
     }

+    void set_weight_adapter(const std::shared_ptr<WeightAdapter>& adapter) override {
+        flux.set_weight_adapter(adapter);
+    }
+
     int64_t get_adm_in_channels() override {
         return 768;
     }
@@ -251,6 +264,10 @@ struct WanModel : public DiffusionModel {
         return wan.get_params_buffer_size();
     }

+    void set_weight_adapter(const std::shared_ptr<WeightAdapter>& adapter) override {
+        wan.set_weight_adapter(adapter);
+    }
+
     int64_t get_adm_in_channels() override {
         return 768;
     }
@@ -313,6 +330,10 @@ struct QwenImageModel : public DiffusionModel {
         return qwen_image.get_params_buffer_size();
     }

+    void set_weight_adapter(const std::shared_ptr<WeightAdapter>& adapter) override {
+        qwen_image.set_weight_adapter(adapter);
+    }
+
     int64_t get_adm_in_channels() override {
         return 768;
     }

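The diffusion models mirror the conditioners: a no-op default on the base struct, plus a one-line delegation in each concrete model. The call site that fans a single adapter out to both halves is not in this excerpt; presumably something like the following happens in the stable-diffusion context object once at_runtime LoRAs are loaded (the function and member names here are hypothetical):

    // Hypothetical wiring: one adapter, every weight-owning runner.
    void set_lora_weight_adapter(const std::shared_ptr<WeightAdapter>& adapter) {
        cond_stage_model->set_weight_adapter(adapter);  // Conditioner
        diffusion_model->set_weight_adapter(adapter);   // DiffusionModel
    }
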
docs/lora.md

Lines changed: 12 additions & 35 deletions
@@ -12,38 +12,15 @@ Here's a simple example:

 `../models/marblesh.safetensors` or `../models/marblesh.ckpt` will be applied to the model

-# Support matrix
-
-> ℹ️ CUDA `get_rows` support is defined here:
-> [ggml-org/ggml/src/ggml-cuda/getrows.cu#L156](https://github.com/ggml-org/ggml/blob/7dee1d6a1e7611f238d09be96738388da97c88ed/src/ggml-cuda/getrows.cu#L156)
-> Currently only the basic types + Q4/Q5/Q8 are implemented. K-quants are **not** supported.
-
-NOTE: The other backends may have different support.
-
-| Quant / Type | CUDA | Vulkan |
-|--------------|------|--------|
-| F32          | ✔️   | ✔️     |
-| F16          | ✔️   | ✔️     |
-| BF16         | ✔️   | ✔️     |
-| I32          | ✔️   |        |
-| Q4_0         | ✔️   | ✔️     |
-| Q4_1         | ✔️   | ✔️     |
-| Q5_0         | ✔️   | ✔️     |
-| Q5_1         | ✔️   | ✔️     |
-| Q8_0         | ✔️   | ✔️     |
-| Q2_K         |      |        |
-| Q3_K         |      |        |
-| Q4_K         |      |        |
-| Q5_K         |      |        |
-| Q6_K         |      |        |
-| Q8_K         |      |        |
-| IQ1_S        |      | ✔️     |
-| IQ1_M        |      | ✔️     |
-| IQ2_XXS      |      | ✔️     |
-| IQ2_XS       |      | ✔️     |
-| IQ2_S        |      | ✔️     |
-| IQ3_XXS      |      | ✔️     |
-| IQ3_S        |      | ✔️     |
-| IQ4_XS       |      | ✔️     |
-| IQ4_NL       |      | ✔️     |
-| MXFP4        |      | ✔️     |
+# LoRA Apply Mode
+
+There are two ways to apply LoRA: **immediately** and **at_runtime**. You can select the mode with the `--lora-apply-mode` parameter.
+
+By default, the mode is selected automatically:
+
+* If the model weights contain any quantized parameters, the **at_runtime** mode is used;
+* Otherwise, the **immediately** mode is used.
+
+The **immediately** mode may have precision and compatibility issues with quantized parameters, but it usually offers faster inference and, in some cases, lower memory usage.
+In contrast, the **at_runtime** mode provides better compatibility and higher precision, but inference may be slower and memory usage may be higher in some cases.
+

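A usage example to make the flag concrete. The `--lora-apply-mode` flag is added by this commit; the binary path, `--lora-model-dir`, and the `<lora:name:strength>` prompt syntax follow this repo's other docs, and the model paths are placeholders:

    ./build/bin/sd -m ../models/v1-5-pruned-emaonly.safetensors \
        --lora-model-dir ../models \
        -p "a lovely cat<lora:marblesh:1>" \
        --lora-apply-mode at_runtime
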
esrgan.hpp

Lines changed: 1 addition & 1 deletion
@@ -344,7 +344,7 @@ struct ESRGAN : public GGMLRunner {
         if (!rrdb_net)
             return nullptr;
         constexpr int kGraphNodes = 1 << 16;  // 65k
-        struct ggml_cgraph* gf = ggml_new_graph_custom(compute_ctx, kGraphNodes, /*grads*/ false);
+        struct ggml_cgraph* gf = new_graph_custom(kGraphNodes);
         x = to_backend(x);

         auto runner_ctx = get_context();

examples/cli/README.md

Lines changed: 6 additions & 0 deletions
@@ -99,6 +99,12 @@ Options:
   --sampling-method                  sampling method, one of [euler, euler_a, heun, dpm2, dpm++2s_a, dpm++2m, dpm++2mv2, ipndm, ipndm_v, lcm, ddim_trailing,
                                      tcd] (default: euler for Flux/SD3/Wan, euler_a otherwise)
   --prediction                       prediction type override, one of [eps, v, edm_v, sd3_flow, flux_flow]
+  --lora-apply-mode                  the way to apply LoRA, one of [auto, immediately, at_runtime], default is auto. In auto mode, if the model weights
+                                     contain any quantized parameters, the at_runtime mode will be used; otherwise,
+                                     immediately will be used. The immediately mode may have precision and
+                                     compatibility issues with quantized parameters, but it usually offers faster inference
+                                     speed and, in some cases, lower memory usage. The at_runtime mode, on the other
+                                     hand, is exactly the opposite.
   --scheduler                        denoiser sigma scheduler, one of [discrete, karras, exponential, ays, gits, smoothstep, sgm_uniform, simple], default:
                                      discrete
   --skip-layers                      layers to skip for SLG steps (default: [7,8,9])

examples/cli/main.cpp

Lines changed: 26 additions & 1 deletion
@@ -137,7 +137,8 @@ struct SDParams {
     int chroma_t5_mask_pad = 1;
     float flow_shift       = INFINITY;

-    prediction_t prediction = DEFAULT_PRED;
+    prediction_t prediction           = DEFAULT_PRED;
+    lora_apply_mode_t lora_apply_mode = LORA_APPLY_AUTO;

     sd_tiling_params_t vae_tiling_params = {false, 0, 0, 0.5f, 0.0f, 0.0f};
     bool force_sdxl_vae_conv_scale       = false;
@@ -209,6 +210,7 @@ void print_params(SDParams params) {
     printf("    high_noise_sample_params: %s\n", SAFE_STR(high_noise_sample_params_str));
     printf("    moe_boundary:      %.3f\n", params.moe_boundary);
     printf("    prediction:        %s\n", sd_prediction_name(params.prediction));
+    printf("    lora_apply_mode:   %s\n", sd_lora_apply_mode_name(params.lora_apply_mode));
     printf("    flow_shift:        %.2f\n", params.flow_shift);
     printf("    strength(img2img): %.2f\n", params.strength);
     printf("    rng:               %s\n", sd_rng_type_name(params.rng_type));
@@ -926,6 +928,20 @@ void parse_args(int argc, const char** argv, SDParams& params) {
         return 1;
     };

+    auto on_lora_apply_mode_arg = [&](int argc, const char** argv, int index) {
+        if (++index >= argc) {
+            return -1;
+        }
+        const char* arg        = argv[index];
+        params.lora_apply_mode = str_to_lora_apply_mode(arg);
+        if (params.lora_apply_mode == LORA_APPLY_MODE_COUNT) {
+            fprintf(stderr, "error: invalid lora apply mode %s\n",
+                    arg);
+            return -1;
+        }
+        return 1;
+    };
+
     auto on_sample_method_arg = [&](int argc, const char** argv, int index) {
         if (++index >= argc) {
             return -1;
@@ -1123,6 +1139,14 @@ void parse_args(int argc, const char** argv, SDParams& params) {
          "--prediction",
          "prediction type override, one of [eps, v, edm_v, sd3_flow, flux_flow]",
          on_prediction_arg},
+        {"",
+         "--lora-apply-mode",
+         "the way to apply LoRA, one of [auto, immediately, at_runtime], default is auto. "
+         "In auto mode, if the model weights contain any quantized parameters, the at_runtime mode will be used; otherwise, immediately will be used. "
+         "The immediately mode may have precision and compatibility issues with quantized parameters, "
+         "but it usually offers faster inference speed and, in some cases, lower memory usage. "
+         "The at_runtime mode, on the other hand, is exactly the opposite.",
+         on_lora_apply_mode_arg},
         {"",
          "--scheduler",
          "denoiser sigma scheduler, one of [discrete, karras, exponential, ays, gits, smoothstep, sgm_uniform, simple], default: discrete",
@@ -1738,6 +1762,7 @@ int main(int argc, const char* argv[]) {
                                   params.wtype,
                                   params.rng_type,
                                   params.prediction,
+                                  params.lora_apply_mode,
                                   params.offload_params_to_cpu,
                                   params.clip_on_cpu,
                                   params.control_net_cpu,

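The diff references lora_apply_mode_t, LORA_APPLY_AUTO, LORA_APPLY_MODE_COUNT, and str_to_lora_apply_mode() without showing their definitions. A minimal sketch of what they likely look like; the two explicit enumerator names are assumptions, everything else is taken from the diff:

    #include <cstring>

    // Sketch only: enumerator order matches the [auto, immediately, at_runtime]
    // list in the help text; LORA_APPLY_MODE_COUNT doubles as the sentinel the
    // parser checks for invalid input.
    enum lora_apply_mode_t {
        LORA_APPLY_AUTO,
        LORA_APPLY_IMMEDIATELY,  // assumed name
        LORA_APPLY_AT_RUNTIME,   // assumed name
        LORA_APPLY_MODE_COUNT,
    };

    lora_apply_mode_t str_to_lora_apply_mode(const char* s) {
        static const char* names[LORA_APPLY_MODE_COUNT] = {"auto", "immediately", "at_runtime"};
        for (int i = 0; i < LORA_APPLY_MODE_COUNT; i++) {
            if (strcmp(s, names[i]) == 0) {
                return (lora_apply_mode_t)i;
            }
        }
        return LORA_APPLY_MODE_COUNT;  // unknown string -> invalid
    }
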
flux.hpp

Lines changed: 1 addition & 1 deletion
@@ -1243,7 +1243,7 @@ namespace Flux {
                          bool increase_ref_index      = false,
                          std::vector<int> skip_layers = {}) {
         GGML_ASSERT(x->ne[3] == 1);
-        struct ggml_cgraph* gf = ggml_new_graph_custom(compute_ctx, FLUX_GRAPH_SIZE, false);
+        struct ggml_cgraph* gf = new_graph_custom(FLUX_GRAPH_SIZE);

         struct ggml_tensor* mod_index_arange = nullptr;
         struct ggml_tensor* dct              = nullptr;  // for chroma radiance
