diff --git a/.github/ISSUE_TEMPLATE/bug_report.yml b/.github/ISSUE_TEMPLATE/bug_report.yml
deleted file mode 100644
index 5affda3bc..000000000
--- a/.github/ISSUE_TEMPLATE/bug_report.yml
+++ /dev/null
@@ -1,73 +0,0 @@
-name: 🐞 Bug Report
-description: Report a bug or unexpected behavior
-title: "[Bug] "
-labels: ["bug"]
-body:
-  - type: markdown
-    attributes:
-      value: |
-        Please use this template and include as many details as possible to help us reproduce and fix the issue.
-  - type: textarea
-    id: commit
-    attributes:
-      label: Git commit
-      description: Which commit are you trying to compile?
-      placeholder: |
-        $git rev-parse HEAD
-        40a6a8710ec15b1b5db6b5a098409f6bc8f654a4
-    validations:
-      required: true
-  - type: input
-    id: os
-    attributes:
-      label: Operating System & Version
-      placeholder: e.g. “Ubuntu 22.04”, “Windows 11 23H2”, “macOS 14.3”
-    validations:
-      required: true
-  - type: dropdown
-    id: backends
-    attributes:
-        label: GGML backends
-        description: Which GGML backends do you know to be affected?
-        options: [CPU, CUDA, HIP, Metal, Musa, SYCL, Vulkan, OpenCL]
-        multiple: true
-    validations:
-      required: true
-  - type: input
-    id: cmd_arguments
-    attributes:
-      label: Command-line arguments used
-      placeholder: The full command line you ran (with all flags)
-    validations:
-      required: true
-  - type: textarea
-    id: steps_to_reproduce
-    attributes:
-      label: Steps to reproduce
-      placeholder: A step-by-step list of what you did
-    validations:
-      required: true
-  - type: textarea
-    id: expected_behavior
-    attributes:
-      label: What you expected to happen
-      placeholder: Describe the expected behavior or result
-    validations:
-      required: true
-  - type: textarea
-    id: actual_behavior
-    attributes:
-      label: What actually happened
-      placeholder: Describe what you saw instead (errors, logs, crash, etc.)
-    validations:
-      required: true
-  - type: textarea
-    id: logs_and_errors
-    attributes:
-      label: Logs / error messages / stack trace
-      placeholder: Paste complete logs or error output
-  - type: textarea
-    id: additional_info
-    attributes:
-      label: Additional context / environment details
-      placeholder: e.g. CPU model, GPU, RAM, model file versions, quantization type, etc.
diff --git a/.github/ISSUE_TEMPLATE/feature_request.yml b/.github/ISSUE_TEMPLATE/feature_request.yml
deleted file mode 100644
index 243faca4c..000000000
--- a/.github/ISSUE_TEMPLATE/feature_request.yml
+++ /dev/null
@@ -1,33 +0,0 @@
-name: 💡 Feature Request
-description: Suggest a new feature or improvement
-title: "[Feature] "
-labels: ["enhancement"]
-body:
-  - type: markdown
-    attributes:
-      value: |
-        Thank you for suggesting an improvement! Please fill in the fields below.
-  - type: input
-    id: summary
-    attributes:
-      label: Feature Summary
-      placeholder: A one-line summary of the feature you’d like
-    validations:
-      required: true
-  - type: textarea
-    id: description
-    attributes:
-      label: Detailed Description
-      placeholder: What problem does this solve? How do you expect it to work?
-    validations:
-      required: true
-  - type: textarea
-    id: alternatives
-    attributes:
-      label: Alternatives you considered
-      placeholder: Any alternative designs or workarounds you tried
-  - type: textarea
-    id: additional_context
-    attributes:
-      label: Additional context
-      placeholder: Any extra information (use cases, related functionalities, constraints)
diff --git a/README.md b/README.md
index 4bb224213..b32d3fa62 100644
--- a/README.md
+++ b/README.md
@@ -4,29 +4,11 @@
 
 # stable-diffusion.cpp
 
-<div align="center">
-<a href="https://trendshift.io/repositories/9714" target="_blank"><img src="https://trendshift.io/api/badge/repositories/9714" alt="leejet%2Fstable-diffusion.cpp | Trendshift" style="width: 250px; height: 55px;" width="250" height="55"/></a>
-</div>
-
 Diffusion model(SD,Flux,Wan,...) inference in pure C/C++
 
 ***Note that this project is under active development. \
 API and command-line option may change frequently.***
 
-## 🔥Important News
-
-* **2025/10/13** 🚀 stable-diffusion.cpp now supports **Qwen-Image-Edit / Qwen-Image-Edit 2509**  
-  👉 Details: [PR #877](https://github.com/leejet/stable-diffusion.cpp/pull/877)
-
-* **2025/10/12** 🚀 stable-diffusion.cpp now supports **Qwen-Image**  
-  👉 Details: [PR #851](https://github.com/leejet/stable-diffusion.cpp/pull/851)
-
-* **2025/09/14** 🚀 stable-diffusion.cpp now supports **Wan2.1 Vace**  
-  👉 Details: [PR #819](https://github.com/leejet/stable-diffusion.cpp/pull/819)
-
-* **2025/09/06** 🚀 stable-diffusion.cpp now supports **Wan2.1 / Wan2.2**  
-  👉 Details: [PR #778](https://github.com/leejet/stable-diffusion.cpp/pull/778)
-
 ## Features
 
 - Plain C/C++ implementation based on [ggml](https://github.com/ggerganov/ggml), working in the same way as [llama.cpp](https://github.com/ggerganov/llama.cpp)
@@ -50,22 +32,14 @@ API and command-line option may change frequently.***
   - Latent Consistency Models support (LCM/LCM-LoRA)
   - Faster and memory efficient latent decoding with [TAESD](https://github.com/madebyollin/taesd)
   - Upscale images generated with [ESRGAN](https://github.com/xinntao/Real-ESRGAN)
-- Supported backends
-  - CPU (AVX, AVX2 and AVX512 support for x86 architectures)
-  - CUDA
-  - Vulkan
-  - Metal
-  - OpenCL
-  - SYCL
-- Supported weight formats
-  - Pytorch checkpoint (`.ckpt` or `.pth`)
-  - Safetensors (`./safetensors`)
-  - GGUF (`.gguf`)
-- Supported platforms
-    - Linux
-    - Mac OS
-    - Windows
-    - Android (via Termux, [Local Diffusion](https://github.com/rmatif/Local-Diffusion))
+- 16-bit, 32-bit float support
+- 2-bit, 3-bit, 4-bit, 5-bit and 8-bit integer quantization support
+- Accelerated memory-efficient CPU inference
+    - Only requires ~2.3GB when using txt2img with fp16 precision to generate a 512x512 image; with Flash Attention enabled, it requires only ~1.8GB.
+- AVX, AVX2 and AVX512 support for x86 architectures
+- Full CUDA, Metal, Vulkan, OpenCL and SYCL backend for GPU acceleration.
+- Can load ckpt, safetensors and diffusers models/checkpoints, as well as standalone VAE models
+    - No need to convert to `.ggml` or `.gguf` anymore!
 - Flash Attention for memory usage optimization
 - Negative prompt
 - [stable-diffusion-webui](https://github.com/AUTOMATIC1111/stable-diffusion-webui) style tokenizer (not all the features, only token weighting for now)
@@ -81,45 +55,376 @@ API and command-line option may change frequently.***
     - [`LCM`](https://github.com/AUTOMATIC1111/stable-diffusion-webui/issues/13952)
 - Cross-platform reproducibility (`--rng cuda`, consistent with the `stable-diffusion-webui GPU RNG`)
 - Embedds generation parameters into png output as webui-compatible text string
+- Supported platforms
+    - Linux
+    - Mac OS
+    - Windows
+    - Android (via Termux, [Local Diffusion](https://github.com/rmatif/Local-Diffusion))
 
-## Quick Start
+## Usage
 
-### Get the sd executable
+For most users, you can download the built executable program from the latest [release](https://github.com/leejet/stable-diffusion.cpp/releases/latest).
+If the built product does not meet your requirements, you can choose to build it manually.
 
-- Download pre-built binaries from the [releases page](https://github.com/leejet/stable-diffusion.cpp/releases)
-- Or build from source by following the [build guide](./docs/build.md)
+### Get the Code
 
-### Download model weights
+```
+git clone --recursive https://github.com/leejet/stable-diffusion.cpp
+cd stable-diffusion.cpp
+```
 
-- download weights(.ckpt or .safetensors or .gguf). For example
+- If you have already cloned the repository, you can use the following command to update the repository to the latest code.
+
+```
+cd stable-diffusion.cpp
+git pull origin master
+git submodule init
+git submodule update
+```
+
+### Download weights
+
+- download original weights(.ckpt or .safetensors). For example
+    - Stable Diffusion v1.4 from https://huggingface.co/CompVis/stable-diffusion-v-1-4-original
     - Stable Diffusion v1.5 from https://huggingface.co/runwayml/stable-diffusion-v1-5
+    - Stable Diffusion v2.1 from https://huggingface.co/stabilityai/stable-diffusion-2-1
+    - Stable Diffusion 3 2B from https://huggingface.co/stabilityai/stable-diffusion-3-medium
+
+    ```shell
+    curl -L -O https://huggingface.co/CompVis/stable-diffusion-v-1-4-original/resolve/main/sd-v1-4.ckpt
+    # curl -L -O https://huggingface.co/runwayml/stable-diffusion-v1-5/resolve/main/v1-5-pruned-emaonly.safetensors
+    # curl -L -O https://huggingface.co/stabilityai/stable-diffusion-2-1/resolve/main/v2-1_768-nonema-pruned.safetensors
+    # curl -L -O https://huggingface.co/stabilityai/stable-diffusion-3-medium/resolve/main/sd3_medium_incl_clips_t5xxlfp16.safetensors
+    ```
+
+### Build
+
+#### Build from scratch
+
+```shell
+mkdir build
+cd build
+cmake ..
+cmake --build . --config Release
+```
+
+##### Using OpenBLAS
+
+```
+cmake .. -DGGML_OPENBLAS=ON
+cmake --build . --config Release
+```
+
+##### Using CUDA
+
+This provides BLAS acceleration using the CUDA cores of your Nvidia GPU. Make sure to have the CUDA toolkit installed. You can download it from your Linux distro's package manager (e.g. `apt install nvidia-cuda-toolkit`) or from here: [CUDA Toolkit](https://developer.nvidia.com/cuda-downloads). Recommended to have at least 4 GB of VRAM.
+
+```
+cmake .. -DSD_CUDA=ON
+cmake --build . --config Release
+```
+
+##### Using HipBLAS
+This provides BLAS acceleration using the ROCm cores of your AMD GPU. Make sure to have the ROCm toolkit installed.
+To build for another GPU architecture than installed in your system, set `$GFX_NAME` manually to the desired architecture (replace first command). This is also necessary if your GPU is not officially supported by ROCm, for example you have to set `$GFX_NAME` manually to `gfx1030` for consumer RDNA2 cards.
+
+Windows users: refer to [docs/hipBLAS_on_Windows.md](docs/hipBLAS_on_Windows.md) for a comprehensive guide.
+
+```
+if command -v rocminfo; then export GFX_NAME=$(rocminfo | awk '/ *Name: +gfx[1-9]/ {print $2; exit}'); else echo "rocminfo missing!"; fi
+if [ -z "${GFX_NAME}" ]; then echo "Error: Couldn't detect GPU!"; else echo "Building for GPU: ${GFX_NAME}"; fi
+cmake .. -G "Ninja" -DCMAKE_C_COMPILER=clang -DCMAKE_CXX_COMPILER=clang++ -DSD_HIPBLAS=ON -DCMAKE_BUILD_TYPE=Release -DGPU_TARGETS=$GFX_NAME -DAMDGPU_TARGETS=$GFX_NAME -DCMAKE_BUILD_WITH_INSTALL_RPATH=ON -DCMAKE_POSITION_INDEPENDENT_CODE=ON
+cmake --build . --config Release
+```
+
+##### Using MUSA
+
+This provides BLAS acceleration using the MUSA cores of your Moore Threads GPU. Make sure to have the MUSA toolkit installed.
+
+```bash
+cmake .. -DCMAKE_C_COMPILER=/usr/local/musa/bin/clang -DCMAKE_CXX_COMPILER=/usr/local/musa/bin/clang++ -DSD_MUSA=ON -DCMAKE_BUILD_TYPE=Release
+cmake --build . --config Release
+```
+
+##### Using Metal
+
+Using Metal makes the computation run on the GPU. Currently, there are some issues with Metal when performing operations on very large matrices, making it highly inefficient at the moment. Performance improvements are expected in the near future.
+
+```
+cmake .. -DSD_METAL=ON
+cmake --build . --config Release
+```
+
+##### Using Vulkan
+
+Install Vulkan SDK from https://www.lunarg.com/vulkan-sdk/.
 
-    ```sh
-    curl -L -O https://huggingface.co/runwayml/stable-diffusion-v1-5/resolve/main/v1-5-pruned-emaonly.safetensors
+```
+cmake .. -DSD_VULKAN=ON
+cmake --build . --config Release
+```
+
+##### Using OpenCL (for Adreno GPU)
+
+Currently, it supports only Adreno GPUs and is primarily optimized for Q4_0 type
+
+To build for Windows ARM, please refer to
+[Windows 11 Arm64](https://github.com/ggml-org/llama.cpp/blob/master/docs/backend/OPENCL.md#windows-11-arm64)
+
+Building for Android:
+
+  Android NDK:
+       Download and install the Android NDK from the [official Android developer site](https://developer.android.com/ndk/downloads).
+
+Setup OpenCL Dependencies for NDK:
+
+You need to provide OpenCL headers and the ICD loader library to your NDK sysroot.
+
+*   OpenCL Headers:
+    ```bash
+    # In a temporary working directory
+    git clone https://github.com/KhronosGroup/OpenCL-Headers
+    cd OpenCL-Headers
+    # Replace <YOUR_NDK_PATH> with your actual NDK installation path
+    # e.g., cp -r CL /path/to/android-ndk-r26c/toolchains/llvm/prebuilt/linux-x86_64/sysroot/usr/include
+    sudo cp -r CL <YOUR_NDK_PATH>/toolchains/llvm/prebuilt/linux-x86_64/sysroot/usr/include
+    cd ..
     ```
 
-### Generate an image with just one command
+*   OpenCL ICD Loader:
+    ```bash
+    # In the same temporary working directory
+    git clone https://github.com/KhronosGroup/OpenCL-ICD-Loader
+    cd OpenCL-ICD-Loader
+    mkdir build_ndk && cd build_ndk
+
+    # Replace <YOUR_NDK_PATH> in the CMAKE_TOOLCHAIN_FILE and OPENCL_ICD_LOADER_HEADERS_DIR
+    cmake .. -G Ninja -DCMAKE_BUILD_TYPE=Release \
+      -DCMAKE_TOOLCHAIN_FILE=<YOUR_NDK_PATH>/build/cmake/android.toolchain.cmake \
+      -DOPENCL_ICD_LOADER_HEADERS_DIR=<YOUR_NDK_PATH>/toolchains/llvm/prebuilt/linux-x86_64/sysroot/usr/include \
+      -DANDROID_ABI=arm64-v8a \
+      -DANDROID_PLATFORM=24 \
+      -DANDROID_STL=c++_shared
+
+    ninja
+    # Replace <YOUR_NDK_PATH>
+    # e.g., cp libOpenCL.so /path/to/android-ndk-r26c/toolchains/llvm/prebuilt/linux-x86_64/sysroot/usr/lib/aarch64-linux-android
+    sudo cp libOpenCL.so <YOUR_NDK_PATH>/toolchains/llvm/prebuilt/linux-x86_64/sysroot/usr/lib/aarch64-linux-android
+    cd ../..
+    ```
+
+Build `stable-diffusion.cpp` for Android with OpenCL:
+
+```bash
+mkdir build-android && cd build-android
+
+# Replace <YOUR_NDK_PATH> with your actual NDK installation path
+# e.g., -DCMAKE_TOOLCHAIN_FILE=/path/to/android-ndk-r26c/build/cmake/android.toolchain.cmake
+cmake .. -G Ninja \
+  -DCMAKE_TOOLCHAIN_FILE=<YOUR_NDK_PATH>/build/cmake/android.toolchain.cmake \
+  -DANDROID_ABI=arm64-v8a \
+  -DANDROID_PLATFORM=android-28 \
+  -DGGML_OPENMP=OFF \
+  -DSD_OPENCL=ON
+
+ninja
+```
+*(Note: Don't forget to include `LD_LIBRARY_PATH=/vendor/lib64` in your command line before running the binary)*
+
+##### Using SYCL
+
+Using SYCL makes the computation run on the Intel GPU. Please make sure you have installed the related driver and [Intel® oneAPI Base toolkit](https://www.intel.com/content/www/us/en/developer/tools/oneapi/base-toolkit.html) before you start. For more details and steps, refer to the [llama.cpp SYCL backend](https://github.com/ggerganov/llama.cpp/blob/master/docs/backend/SYCL.md#linux).
+
+```
+# Export relevant ENV variables
+source /opt/intel/oneapi/setvars.sh
+
+# Option 1: Use FP32 (recommended for better performance in most cases)
+cmake .. -DSD_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx
+
+# Option 2: Use FP16
+cmake .. -DSD_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx -DGGML_SYCL_F16=ON
+
+cmake --build . --config Release
+```
+
+Example of text2img by using SYCL backend:
+
+- download `stable-diffusion` model weight, refer to [download-weight](#download-weights).
+
+- run `./bin/sd -m ../models/sd3_medium_incl_clips_t5xxlfp16.safetensors --cfg-scale 5 --steps 30 --sampling-method euler  -H 1024 -W 1024 --seed 42 -p "fantasy medieval village world inside a glass sphere , high detail, fantasy, realistic, light effect, hyper detail, volumetric lighting, cinematic, macro, depth of field, blur, red light and clouds from the back, highly detailed epic cinematic concept art cg render made in maya, blender and photoshop, octane render, excellent composition, dynamic dramatic cinematic lighting, aesthetic, very inspirational, world inside a glass sphere by james gurney by artgerm with james jean, joe fenton and tristan eaton by ross tran, fine details, 4k resolution"`
+
+<p align="center">
+  <img src="./assets/sycl_sd3_output.png" width="360x">
+</p>
+
+
+
+##### Using Flash Attention
+
+Enabling flash attention for the diffusion model reduces memory usage; the amount saved varies.
+For example:
+ - flux 768x768 ~600 MB
+ - SD2 768x768 ~1400 MB
+
+For most backends, it slows things down, but for cuda it generally speeds it up too.
+At the moment, it is only supported for some models and some backends (like cpu, cuda/rocm, metal).
+
+Run by adding `--diffusion-fa` to the arguments and watch for:
+```
+[INFO ] stable-diffusion.cpp:312  - Using flash attention in the diffusion model
+```
+and the compute buffer shrink in the debug log:
+```
+[DEBUG] ggml_extend.hpp:1004 - flux compute buffer size: 650.00 MB(VRAM)
+```
+
+### Run
+
+```
+usage: ./bin/sd [arguments]
+
+arguments:
+  -h, --help                         show this help message and exit
+  -M, --mode [MODE]                  run mode, one of: [img_gen, vid_gen, upscale, convert], default: img_gen
+  -t, --threads N                    number of threads to use during computation (default: -1)
+                                     If threads <= 0, then threads will be set to the number of CPU physical cores
+  --offload-to-cpu                   place the weights in RAM to save VRAM, and automatically load them into VRAM when needed
+  -m, --model [MODEL]                path to full model
+  --diffusion-model                  path to the standalone diffusion model
+  --high-noise-diffusion-model       path to the standalone high noise diffusion model
+  --clip_l                           path to the clip-l text encoder
+  --clip_g                           path to the clip-g text encoder
+  --clip_vision                      path to the clip-vision encoder
+  --t5xxl                            path to the t5xxl text encoder
+  --qwen2vl                          path to the qwen2vl text encoder
+  --qwen2vl_vision                   path to the qwen2vl vit
+  --vae [VAE]                        path to vae
+  --taesd [TAESD_PATH]               path to taesd. Using Tiny AutoEncoder for fast decoding (low quality)
+  --control-net [CONTROL_PATH]       path to control net model
+  --embd-dir [EMBEDDING_PATH]        path to embeddings
+  --upscale-model [ESRGAN_PATH]      path to esrgan model. For img_gen mode, upscale images after generate, just RealESRGAN_x4plus_anime_6B supported by now
+  --upscale-repeats                  Run the ESRGAN upscaler this many times (default 1)
+  --type [TYPE]                      weight type (examples: f32, f16, q4_0, q4_1, q5_0, q5_1, q8_0, q2_K, q3_K, q4_K)
+                                     If not specified, the default is the type of the weight file
+  --tensor-type-rules [EXPRESSION]   weight type per tensor pattern (example: "^vae\.=f16,model\.=q8_0")
+  --lora-model-dir [DIR]             lora model directory
+  -i, --init-img [IMAGE]             path to the init image, required by img2img
+  --mask [MASK]                      path to the mask image, required by img2img with mask
+  --end-img [IMAGE]                  path to the end image, required by flf2v
+  --control-image [IMAGE]            path to image condition, control net
+  -r, --ref-image [PATH]             reference image for Flux Kontext models (can be used multiple times)
+  --control-video [PATH]             path to control video frames, It must be a directory path.
+                                     The video frames inside should be stored as images in lexicographical (character) order
+                                     For example, if the control video path is `frames`, the directory contains images such as 00.png, 01.png, … etc.
+  --increase-ref-index               automatically increase the indices of references images based on the order they are listed (starting with 1).
+  -o, --output OUTPUT                path to write result image to (default: ./output.png)
+  -p, --prompt [PROMPT]              the prompt to render
+  -n, --negative-prompt PROMPT       the negative prompt (default: "")
+  --cfg-scale SCALE                  unconditional guidance scale: (default: 7.0)
+  --img-cfg-scale SCALE              image guidance scale for inpaint or instruct-pix2pix models: (default: same as --cfg-scale)
+  --guidance SCALE                   distilled guidance scale for models with guidance input (default: 3.5)
+  --slg-scale SCALE                  skip layer guidance (SLG) scale, only for DiT models: (default: 0)
+                                     0 means disabled, a value of 2.5 is nice for sd3.5 medium
+  --eta SCALE                        eta in DDIM, only for DDIM and TCD: (default: 0)
+  --skip-layers LAYERS               Layers to skip for SLG steps: (default: [7,8,9])
+  --skip-layer-start START           SLG enabling point: (default: 0.01)
+  --skip-layer-end END               SLG disabling point: (default: 0.2)
+  --scheduler {discrete, karras, exponential, ays, gits, smoothstep, sgm_uniform, simple} Denoiser sigma scheduler (default: discrete)
+  --sampling-method {euler, euler_a, heun, dpm2, dpm++2s_a, dpm++2m, dpm++2mv2, ipndm, ipndm_v, lcm, ddim_trailing, tcd}
+                                     sampling method (default: "euler" for Flux/SD3/Wan, "euler_a" otherwise)
+  --timestep-shift N                 shift timestep for NitroFusion models, default: 0, recommended N for NitroSD-Realism around 250 and 500 for NitroSD-Vibrant
+  --steps  STEPS                     number of sample steps (default: 20)
+  --high-noise-cfg-scale SCALE       (high noise) unconditional guidance scale: (default: 7.0)
+  --high-noise-img-cfg-scale SCALE   (high noise) image guidance scale for inpaint or instruct-pix2pix models: (default: same as --cfg-scale)
+  --high-noise-guidance SCALE        (high noise) distilled guidance scale for models with guidance input (default: 3.5)
+  --high-noise-slg-scale SCALE       (high noise) skip layer guidance (SLG) scale, only for DiT models: (default: 0)
+                                     0 means disabled, a value of 2.5 is nice for sd3.5 medium
+  --high-noise-eta SCALE             (high noise) eta in DDIM, only for DDIM and TCD: (default: 0)
+  --high-noise-skip-layers LAYERS    (high noise) Layers to skip for SLG steps: (default: [7,8,9])
+  --high-noise-skip-layer-start      (high noise) SLG enabling point: (default: 0.01)
+  --high-noise-skip-layer-end END    (high noise) SLG disabling point: (default: 0.2)
+  --high-noise-scheduler {discrete, karras, exponential, ays, gits, smoothstep, sgm_uniform, simple} Denoiser sigma scheduler (default: discrete)
+  --high-noise-sampling-method {euler, euler_a, heun, dpm2, dpm++2s_a, dpm++2m, dpm++2mv2, ipndm, ipndm_v, lcm, ddim_trailing, tcd}
+                                     (high noise) sampling method (default: "euler_a")
+  --high-noise-steps  STEPS          (high noise) number of sample steps (default: -1 = auto)
+                                     SLG will be enabled at step int([STEPS]*[START]) and disabled at int([STEPS]*[END])
+  --strength STRENGTH                strength for noising/unnoising (default: 0.75)
+  --control-strength STRENGTH        strength to apply Control Net (default: 0.9)
+                                     1.0 corresponds to full destruction of information in init image
+  -H, --height H                     image height, in pixel space (default: 512)
+  -W, --width W                      image width, in pixel space (default: 512)
+  --rng {std_default, cuda}          RNG (default: cuda)
+  -s SEED, --seed SEED               RNG seed (default: 42, use random seed for < 0)
+  -b, --batch-count COUNT            number of images to generate
+  --prediction {eps, v, edm_v, sd3_flow, flux_flow} Prediction type override
+  --clip-skip N                      ignore last layers of CLIP network; 1 ignores none, 2 ignores one layer (default: -1)
+                                     <= 0 represents unspecified, will be 1 for SD1.x, 2 for SD2.x
+  --vae-tiling                       process vae in tiles to reduce memory usage
+  --vae-tile-size [X]x[Y]            tile size for vae tiling (default: 32x32)
+  --vae-relative-tile-size [X]x[Y]   relative tile size for vae tiling, in fraction of image size if < 1, in number of tiles per dim if >=1 (overrides --vae-tile-size)
+  --vae-tile-overlap OVERLAP         tile overlap for vae tiling, in fraction of tile size (default: 0.5)
+  --force-sdxl-vae-conv-scale        force use of conv scale on sdxl vae
+  --vae-on-cpu                       keep vae in cpu (for low vram)
+  --clip-on-cpu                      keep clip in cpu (for low vram)
+  --diffusion-fa                     use flash attention in the diffusion model (for low vram)
+                                     Might lower quality, since it implies converting k and v to f16.
+                                     This might crash if it is not supported by the backend.
+  --diffusion-conv-direct            use Conv2d direct in the diffusion model
+                                     This might crash if it is not supported by the backend.
+  --vae-conv-direct                  use Conv2d direct in the vae model (should improve the performance)
+                                     This might crash if it is not supported by the backend.
+  --control-net-cpu                  keep controlnet in cpu (for low vram)
+  --canny                            apply canny preprocessor (edge detection)
+  --color                            colors the logging tags according to level
+  --chroma-disable-dit-mask          disable dit mask for chroma
+  --chroma-enable-t5-mask            enable t5 mask for chroma
+  --chroma-t5-mask-pad  PAD_SIZE     t5 mask pad size of chroma
+  --video-frames                     video frames (default: 1)
+  --fps                              fps (default: 24)
+  --moe-boundary BOUNDARY            timestep boundary for Wan2.2 MoE model. (default: 0.875)
+                                     only enabled if `--high-noise-steps` is set to -1
+  --flow-shift SHIFT                 shift value for Flow models like SD3.x or WAN (default: auto)
+  --vace-strength                    wan vace strength
+  --photo-maker                      path to PHOTOMAKER model
+  --pm-id-images-dir [DIR]           path to PHOTOMAKER input id images dir
+  --pm-id-embed-path [PATH]          path to PHOTOMAKER v2 id embed
+  --pm-style-strength                strength for keeping PHOTOMAKER input identity (default: 20)
+  -v, --verbose                      print extra info
+```
+
+#### txt2img example
 
 ```sh
-./bin/sd -m ../models/v1-5-pruned-emaonly.safetensors -p "a lovely cat"
+./bin/sd -m ../models/sd-v1-4.ckpt -p "a lovely cat"
+# ./bin/sd -m ../models/v1-5-pruned-emaonly.safetensors -p "a lovely cat"
+# ./bin/sd -m ../models/sd_xl_base_1.0.safetensors --vae ../models/sdxl_vae-fp16-fix.safetensors -H 1024 -W 1024 -p "a lovely cat" -v
+# ./bin/sd -m ../models/sd3_medium_incl_clips_t5xxlfp16.safetensors -H 1024 -W 1024 -p 'a lovely cat holding a sign says \"Stable Diffusion CPP\"' --cfg-scale 4.5 --sampling-method euler -v --clip-on-cpu
+# ./bin/sd --diffusion-model  ../models/flux1-dev-q3_k.gguf --vae ../models/ae.sft --clip_l ../models/clip_l.safetensors --t5xxl ../models/t5xxl_fp16.safetensors  -p "a lovely cat holding a sign says 'flux.cpp'" --cfg-scale 1.0 --sampling-method euler -v --clip-on-cpu
+# ./bin/sd -m  ..\models\sd3.5_large.safetensors --clip_l ..\models\clip_l.safetensors --clip_g ..\models\clip_g.safetensors --t5xxl ..\models\t5xxl_fp16.safetensors  -H 1024 -W 1024 -p 'a lovely cat holding a sign says \"Stable diffusion 3.5 Large\"' --cfg-scale 4.5 --sampling-method euler -v --clip-on-cpu
 ```
 
-***For detailed command-line arguments, check out [cli doc](./examples/cli/README.md).***
+Using formats of different precisions will yield results of varying quality.
+
+| f32  | f16  |q8_0  |q5_0  |q5_1  |q4_0  |q4_1  |
+| ----  |----  |----  |----  |----  |----  |----  |
+| ![](./assets/f32.png) |![](./assets/f16.png) |![](./assets/q8_0.png) |![](./assets/q5_0.png) |![](./assets/q5_1.png) |![](./assets/q4_0.png) |![](./assets/q4_1.png) |
+
+#### img2img example
 
-## Performance
+- `./output.png` is the image generated from the above txt2img pipeline
 
-If you want to improve performance or reduce VRAM/RAM usage, please refer to [performance guide](./docs/performance.md).
+
+```
+./bin/sd -m ../models/sd-v1-4.ckpt -p "cat with blue eyes" -i ./output.png -o ./img2img_output.png --strength 0.4
+```
+
+<p align="center">
+  <img src="./assets/img2img_output.png" width="256x">
+</p>
 
 ## More Guides
 
-- [SD1.x/SD2.x/SDXL](./docs/sd.md)
-- [SD3/SD3.5](./docs/sd3.md)
-- [Flux-dev/Flux-schnell](./docs/flux.md)
-- [FLUX.1-Kontext-dev](./docs/kontext.md)
-- [Chroma](./docs/chroma.md)
-- [🔥Qwen Image](./docs/qwen_image.md)
-- [🔥Qwen Image Edit/Qwen Image Edit 2509](./docs/qwen_image_edit.md)
-- [🔥Wan2.1/Wan2.2](./docs/wan.md)
 - [LoRA](./docs/lora.md)
 - [LCM/LCM-LoRA](./docs/lcm.md)
 - [Using PhotoMaker to personalize image generation](./docs/photo_maker.md)
diff --git a/common.hpp b/common.hpp
index d32167145..12a959d95 100644
--- a/common.hpp
+++ b/common.hpp
@@ -28,7 +28,7 @@ class DownSampleBlock : public GGMLBlock {
         if (vae_downsample) {
             auto conv = std::dynamic_pointer_cast<Conv2d>(blocks["conv"]);
 
-            x = ggml_pad(ctx, x, 1, 1, 0, 0);
+            x = sd_pad(ctx, x, 1, 1, 0, 0, use_circular_pad());
             x = conv->forward(ctx, x);
         } else {
             auto conv = std::dynamic_pointer_cast<Conv2d>(blocks["op"]);
@@ -325,7 +325,7 @@ class CrossAttention : public GGMLBlock {
         auto k = to_k->forward(ctx, context);  // [N, n_context, inner_dim]
         auto v = to_v->forward(ctx, context);  // [N, n_context, inner_dim]
 
-        x = ggml_nn_attention_ext(ctx, backend, q, k, v, n_head, NULL, false, false, flash_attn);  // [N, n_token, inner_dim]
+        x = ggml_nn_attention_ext(ctx, backend, q, k, v, n_head, NULL, false, false, flash_attn, 1.0f, use_circular_pad());  // [N, n_token, inner_dim]
 
         x = to_out_0->forward(ctx, x);  // [N, n_token, query_dim]
         return x;
diff --git a/conditioner.hpp b/conditioner.hpp
index 4f9efb8cf..e3b588adc 100644
--- a/conditioner.hpp
+++ b/conditioner.hpp
@@ -636,6 +636,11 @@ struct FrozenCLIPVisionEmbedder : public GGMLRunner {
         vision_model.get_param_tensors(tensors, "cond_stage_model.transformer");
     }
 
+    void set_circular_pad(bool enabled) override {  // Propagates the circular-pad flag through this runner.
+        GGMLRunner::set_circular_pad(enabled);  // let the GGMLRunner base handle the flag first
+        vision_model.set_circular_pad(enabled);  // then pass the same flag to the owned vision_model
+    }
+
     struct ggml_cgraph* build_graph(struct ggml_tensor* pixel_values, bool return_pooled, int clip_skip) {
         struct ggml_cgraph* gf = ggml_new_graph(compute_ctx);
 
diff --git a/control.hpp b/control.hpp
index 79b82a220..e8a3c31b2 100644
--- a/control.hpp
+++ b/control.hpp
@@ -387,6 +387,11 @@ struct ControlNet : public GGMLRunner {
         control_net.get_param_tensors(tensors, prefix);
     }
 
+    void set_circular_pad(bool enabled) override {  // Propagates the circular-pad flag through this runner.
+        GGMLRunner::set_circular_pad(enabled);  // let the GGMLRunner base handle the flag first
+        control_net.set_circular_pad(enabled);  // then pass the same flag to the owned control_net
+    }
+
     struct ggml_cgraph* build_graph(struct ggml_tensor* x,
                                     struct ggml_tensor* hint,
                                     struct ggml_tensor* timesteps,
@@ -470,4 +475,4 @@ struct ControlNet : public GGMLRunner {
     }
 };
 
-#endif  // __CONTROL_HPP__
\ No newline at end of file
+#endif  // __CONTROL_HPP__
diff --git a/diffusion_model.hpp b/diffusion_model.hpp
index 6c38b58a9..38ffabc0d 100644
--- a/diffusion_model.hpp
+++ b/diffusion_model.hpp
@@ -36,6 +36,7 @@ struct DiffusionModel {
     virtual void get_param_tensors(std::map<std::string, struct ggml_tensor*>& tensors) = 0;
     virtual size_t get_params_buffer_size()                                             = 0;
     virtual int64_t get_adm_in_channels()                                               = 0;
+    virtual void set_circular_pad(bool enabled) { SD_UNUSED(enabled); }
 };
 
 struct UNetModel : public DiffusionModel {
@@ -77,6 +78,10 @@ struct UNetModel : public DiffusionModel {
         return unet.unet.adm_in_channels;
     }
 
+    void set_circular_pad(bool enabled) override {
+        unet.set_circular_pad(enabled);
+    }
+
     void compute(int n_threads,
                  DiffusionParams diffusion_params,
                  struct ggml_tensor** output     = NULL,
@@ -131,6 +136,10 @@ struct MMDiTModel : public DiffusionModel {
         return 768 + 1280;
     }
 
+    void set_circular_pad(bool enabled) override {
+        mmdit.set_circular_pad(enabled);
+    }
+
     void compute(int n_threads,
                  DiffusionParams diffusion_params,
                  struct ggml_tensor** output     = NULL,
@@ -186,6 +195,10 @@ struct FluxModel : public DiffusionModel {
         return 768;
     }
 
+    void set_circular_pad(bool enabled) override {
+        flux.set_circular_pad(enabled);
+    }
+
     void compute(int n_threads,
                  DiffusionParams diffusion_params,
                  struct ggml_tensor** output     = NULL,
@@ -246,6 +259,10 @@ struct WanModel : public DiffusionModel {
         return 768;
     }
 
+    void set_circular_pad(bool enabled) override {
+        wan.set_circular_pad(enabled);
+    }
+
     void compute(int n_threads,
                  DiffusionParams diffusion_params,
                  struct ggml_tensor** output     = NULL,
@@ -305,6 +322,10 @@ struct QwenImageModel : public DiffusionModel {
         return 768;
     }
 
+    void set_circular_pad(bool enabled) override {
+        qwen_image.set_circular_pad(enabled);
+    }
+
     void compute(int n_threads,
                  DiffusionParams diffusion_params,
                  struct ggml_tensor** output     = NULL,
diff --git a/docs/build.md b/docs/build.md
deleted file mode 100644
index 02889cabb..000000000
--- a/docs/build.md
+++ /dev/null
@@ -1,173 +0,0 @@
-# Build from scratch
-
-## Get the Code
-
-```
-git clone --recursive https://github.com/leejet/stable-diffusion.cpp
-cd stable-diffusion.cpp
-```
-
-- If you have already cloned the repository, you can use the following command to update the repository to the latest code.
-
-```
-cd stable-diffusion.cpp
-git pull origin master
-git submodule init
-git submodule update
-```
-
-## Build (CPU only)
-
-If you don't have a GPU or CUDA installed, you can build a CPU-only version.
-
-```shell
-mkdir build && cd build
-cmake ..
-cmake --build . --config Release
-```
-
-## Build with OpenBLAS
-
-```shell
-mkdir build && cd build
-cmake .. -DGGML_OPENBLAS=ON
-cmake --build . --config Release
-```
-
-## Build with CUDA
-
-This provides GPU acceleration using NVIDIA GPU. Make sure to have the CUDA toolkit installed. You can download it from your Linux distro's package manager (e.g. `apt install nvidia-cuda-toolkit`) or from here: [CUDA Toolkit](https://developer.nvidia.com/cuda-downloads). Recommended to have at least 4 GB of VRAM.
-
-```shell
-mkdir build && cd build
-cmake .. -DSD_CUDA=ON
-cmake --build . --config Release
-```
-
-## Build with HipBLAS
-
-This provides GPU acceleration using AMD GPU. Make sure to have the ROCm toolkit installed.
-To build for another GPU architecture than installed in your system, set `$GFX_NAME` manually to the desired architecture (replace first command). This is also necessary if your GPU is not officially supported by ROCm, for example you have to set `$GFX_NAME` manually to `gfx1030` for consumer RDNA2 cards.
-
-Windows User Refer to [docs/hipBLAS_on_Windows.md](docs%2FhipBLAS_on_Windows.md) for a comprehensive guide.
-
-```shell
-mkdir build && cd build
-if command -v rocminfo; then export GFX_NAME=$(rocminfo | awk '/ *Name: +gfx[1-9]/ {print $2; exit}'); else echo "rocminfo missing!"; fi
-if [ -z "${GFX_NAME}" ]; then echo "Error: Couldn't detect GPU!"; else echo "Building for GPU: ${GFX_NAME}"; fi
-cmake .. -G "Ninja" -DCMAKE_C_COMPILER=clang -DCMAKE_CXX_COMPILER=clang++ -DSD_HIPBLAS=ON -DCMAKE_BUILD_TYPE=Release -DGPU_TARGETS=$GFX_NAME -DAMDGPU_TARGETS=$GFX_NAME -DCMAKE_BUILD_WITH_INSTALL_RPATH=ON -DCMAKE_POSITION_INDEPENDENT_CODE=ON
-cmake --build . --config Release
-```
-
-## Build with MUSA
-
-This provides GPU acceleration using Moore Threads GPU. Make sure to have the MUSA toolkit installed.
-
-```shell
-mkdir build && cd build
-cmake .. -DCMAKE_C_COMPILER=/usr/local/musa/bin/clang -DCMAKE_CXX_COMPILER=/usr/local/musa/bin/clang++ -DSD_MUSA=ON -DCMAKE_BUILD_TYPE=Release
-cmake --build . --config Release
-```
-
-## Build with Metal
-
-Using Metal makes the computation run on the GPU. Currently, there are some issues with Metal when performing operations on very large matrices, making it highly inefficient at the moment. Performance improvements are expected in the near future.
-
-```shell
-mkdir build && cd build
-cmake .. -DSD_METAL=ON
-cmake --build . --config Release
-```
-
-## Build with Vulkan
-
-Install Vulkan SDK from https://www.lunarg.com/vulkan-sdk/.
-
-```shell
-mkdir build && cd build
-cmake .. -DSD_VULKAN=ON
-cmake --build . --config Release
-```
-
-## Build with OpenCL (for Adreno GPU)
-
-Currently, it supports only Adreno GPUs and is primarily optimized for Q4_0 type
-
-To build for Windows ARM please refers to [Windows 11 Arm64](https://github.com/ggml-org/llama.cpp/blob/master/docs/backend/OPENCL.md#windows-11-arm64)
-
-Building for Android:
-
-  Android NDK:
-       Download and install the Android NDK from the [official Android developer site](https://developer.android.com/ndk/downloads).
-
-Setup OpenCL Dependencies for NDK:
-
-You need to provide OpenCL headers and the ICD loader library to your NDK sysroot.
-
-*   OpenCL Headers:
-    ```bash
-    # In a temporary working directory
-    git clone https://github.com/KhronosGroup/OpenCL-Headers
-    cd OpenCL-Headers
-    # Replace <YOUR_NDK_PATH> with your actual NDK installation path
-    # e.g., cp -r CL /path/to/android-ndk-r26c/toolchains/llvm/prebuilt/linux-x86_64/sysroot/usr/include
-    sudo cp -r CL <YOUR_NDK_PATH>/toolchains/llvm/prebuilt/linux-x86_64/sysroot/usr/include
-    cd ..
-    ```
-
-*   OpenCL ICD Loader:
-    ```shell
-    # In the same temporary working directory
-    git clone https://github.com/KhronosGroup/OpenCL-ICD-Loader
-    cd OpenCL-ICD-Loader
-    mkdir build_ndk && cd build_ndk
-
-    # Replace <YOUR_NDK_PATH> in the CMAKE_TOOLCHAIN_FILE and OPENCL_ICD_LOADER_HEADERS_DIR
-    cmake .. -G Ninja -DCMAKE_BUILD_TYPE=Release \
-      -DCMAKE_TOOLCHAIN_FILE=<YOUR_NDK_PATH>/build/cmake/android.toolchain.cmake \
-      -DOPENCL_ICD_LOADER_HEADERS_DIR=<YOUR_NDK_PATH>/toolchains/llvm/prebuilt/linux-x86_64/sysroot/usr/include \
-      -DANDROID_ABI=arm64-v8a \
-      -DANDROID_PLATFORM=24 \
-      -DANDROID_STL=c++_shared
-
-    ninja
-    # Replace <YOUR_NDK_PATH>
-    # e.g., cp libOpenCL.so /path/to/android-ndk-r26c/toolchains/llvm/prebuilt/linux-x86_64/sysroot/usr/lib/aarch64-linux-android
-    sudo cp libOpenCL.so <YOUR_NDK_PATH>/toolchains/llvm/prebuilt/linux-x86_64/sysroot/usr/lib/aarch64-linux-android
-    cd ../..
-    ```
-
-Build `stable-diffusion.cpp` for Android with OpenCL:
-
-```shell
-mkdir build-android && cd build-android
-
-# Replace <YOUR_NDK_PATH> with your actual NDK installation path
-# e.g., -DCMAKE_TOOLCHAIN_FILE=/path/to/android-ndk-r26c/build/cmake/android.toolchain.cmake
-cmake .. -G Ninja \
-  -DCMAKE_TOOLCHAIN_FILE=<YOUR_NDK_PATH>/build/cmake/android.toolchain.cmake \
-  -DANDROID_ABI=arm64-v8a \
-  -DANDROID_PLATFORM=android-28 \
-  -DGGML_OPENMP=OFF \
-  -DSD_OPENCL=ON
-
-ninja
-```
-*(Note: Don't forget to include `LD_LIBRARY_PATH=/vendor/lib64` in your command line before running the binary)*
-
-## Build with SYCL
-
-Using SYCL makes the computation run on the Intel GPU. Please make sure you have installed the related driver and [Intel® oneAPI Base toolkit](https://www.intel.com/content/www/us/en/developer/tools/oneapi/base-toolkit.html) before start. More details and steps can refer to [llama.cpp SYCL backend](https://github.com/ggerganov/llama.cpp/blob/master/docs/backend/SYCL.md#linux).
-
-```shell
-# Export relevant ENV variables
-source /opt/intel/oneapi/setvars.sh
-
-# Option 1: Use FP32 (recommended for better performance in most cases)
-cmake .. -DSD_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx
-
-# Option 2: Use FP16
-cmake .. -DSD_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx -DGGML_SYCL_F16=ON
-
-cmake --build . --config Release
-```
diff --git a/docs/performance.md b/docs/performance.md
deleted file mode 100644
index 0c4735e0b..000000000
--- a/docs/performance.md
+++ /dev/null
@@ -1,26 +0,0 @@
-## Use Flash Attention to save memory and improve speed.
-
-Enabling flash attention for the diffusion model reduces memory usage by varying amounts of MB.
-eg.:
- - flux 768x768 ~600mb
- - SD2 768x768 ~1400mb
-
-For most backends, it slows things down, but for cuda it generally speeds it up too.
-At the moment, it is only supported for some models and some backends (like cpu, cuda/rocm, metal).
-
-Run by adding `--diffusion-fa` to the arguments and watch for:
-```
-[INFO ] stable-diffusion.cpp:312  - Using flash attention in the diffusion model
-```
-and the compute buffer shrink in the debug log:
-```
-[DEBUG] ggml_extend.hpp:1004 - flux compute buffer size: 650.00 MB(VRAM)
-```
-
-## Offload weights to the CPU to save VRAM without reducing generation speed.
-
-Using `--offload-to-cpu` allows you to offload weights to the CPU, saving VRAM without reducing generation speed.
-
-## Use quantization to reduce memory usage.
-
-[quantization](./quantization_and_gguf.md)
\ No newline at end of file
diff --git a/docs/sd.md b/docs/sd.md
deleted file mode 100644
index f95c47287..000000000
--- a/docs/sd.md
+++ /dev/null
@@ -1,37 +0,0 @@
-## Download weights
-
-- download original weights(.ckpt or .safetensors). For example
-    - Stable Diffusion v1.4 from https://huggingface.co/CompVis/stable-diffusion-v-1-4-original
-    - Stable Diffusion v1.5 from https://huggingface.co/runwayml/stable-diffusion-v1-5
-    - Stable Diffuison v2.1 from https://huggingface.co/stabilityai/stable-diffusion-2-1
-    - Stable Diffusion 3 2B from https://huggingface.co/stabilityai/stable-diffusion-3-medium
-
-### txt2img example
-
-```sh
-./bin/sd -m ../models/sd-v1-4.ckpt -p "a lovely cat"
-# ./bin/sd -m ../models/v1-5-pruned-emaonly.safetensors -p "a lovely cat"
-# ./bin/sd -m ../models/sd_xl_base_1.0.safetensors --vae ../models/sdxl_vae-fp16-fix.safetensors -H 1024 -W 1024 -p "a lovely cat" -v
-# ./bin/sd -m ../models/sd3_medium_incl_clips_t5xxlfp16.safetensors -H 1024 -W 1024 -p 'a lovely cat holding a sign says \"Stable Diffusion CPP\"' --cfg-scale 4.5 --sampling-method euler -v --clip-on-cpu
-# ./bin/sd --diffusion-model  ../models/flux1-dev-q3_k.gguf --vae ../models/ae.sft --clip_l ../models/clip_l.safetensors --t5xxl ../models/t5xxl_fp16.safetensors  -p "a lovely cat holding a sign says 'flux.cpp'" --cfg-scale 1.0 --sampling-method euler -v --clip-on-cpu
-# ./bin/sd -m  ..\models\sd3.5_large.safetensors --clip_l ..\models\clip_l.safetensors --clip_g ..\models\clip_g.safetensors --t5xxl ..\models\t5xxl_fp16.safetensors  -H 1024 -W 1024 -p 'a lovely cat holding a sign says \"Stable diffusion 3.5 Large\"' --cfg-scale 4.5 --sampling-method euler -v --clip-on-cpu
-```
-
-Using formats of different precisions will yield results of varying quality.
-
-| f32  | f16  |q8_0  |q5_0  |q5_1  |q4_0  |q4_1  |
-| ----  |----  |----  |----  |----  |----  |----  |
-| ![](../assets/f32.png) |![](../assets/f16.png) |![](../assets/q8_0.png) |![](../assets/q5_0.png) |![](../assets/q5_1.png) |![](../assets/q4_0.png) |![](../assets/q4_1.png) |
-
-### img2img example
-
-- `./output.png` is the image generated from the above txt2img pipeline
-
-
-```
-./bin/sd -m ../models/sd-v1-4.ckpt -p "cat with blue eyes" -i ./output.png -o ./img2img_output.png --strength 0.4
-```
-
-<p align="center">
-  <img src="http://github.com/assets/img2img_output.png" width="256x">
-</p>
\ No newline at end of file
diff --git a/examples/cli/README.md b/examples/cli/README.md
deleted file mode 100644
index 6e8ddd48d..000000000
--- a/examples/cli/README.md
+++ /dev/null
@@ -1,113 +0,0 @@
-# Run
-
-```
-usage: ./bin/sd [arguments]
-
-arguments:
-  -h, --help                         show this help message and exit
-  -M, --mode [MODE]                  run mode, one of: [img_gen, vid_gen, upscale, convert], default: img_gen
-  -t, --threads N                    number of threads to use during computation (default: -1)
-                                     If threads <= 0, then threads will be set to the number of CPU physical cores
-  --offload-to-cpu                   place the weights in RAM to save VRAM, and automatically load them into VRAM when needed
-  -m, --model [MODEL]                path to full model
-  --diffusion-model                  path to the standalone diffusion model
-  --high-noise-diffusion-model       path to the standalone high noise diffusion model
-  --clip_l                           path to the clip-l text encoder
-  --clip_g                           path to the clip-g text encoder
-  --clip_vision                      path to the clip-vision encoder
-  --t5xxl                            path to the t5xxl text encoder
-  --qwen2vl                          path to the qwen2vl text encoder
-  --qwen2vl_vision                   path to the qwen2vl vit
-  --vae [VAE]                        path to vae
-  --taesd [TAESD_PATH]               path to taesd. Using Tiny AutoEncoder for fast decoding (low quality)
-  --control-net [CONTROL_PATH]       path to control net model
-  --embd-dir [EMBEDDING_PATH]        path to embeddings
-  --upscale-model [ESRGAN_PATH]      path to esrgan model. For img_gen mode, upscale images after generate, just RealESRGAN_x4plus_anime_6B supported by now
-  --upscale-repeats                  Run the ESRGAN upscaler this many times (default 1)
-  --type [TYPE]                      weight type (examples: f32, f16, q4_0, q4_1, q5_0, q5_1, q8_0, q2_K, q3_K, q4_K)
-                                     If not specified, the default is the type of the weight file
-  --tensor-type-rules [EXPRESSION]   weight type per tensor pattern (example: "^vae\.=f16,model\.=q8_0")
-  --lora-model-dir [DIR]             lora model directory
-  -i, --init-img [IMAGE]             path to the init image, required by img2img
-  --mask [MASK]                      path to the mask image, required by img2img with mask
-  -i, --end-img [IMAGE]              path to the end image, required by flf2v
-  --control-image [IMAGE]            path to image condition, control net
-  -r, --ref-image [PATH]             reference image for Flux Kontext models (can be used multiple times)
-  --control-video [PATH]             path to control video frames, It must be a directory path.
-                                     The video frames inside should be stored as images in lexicographical (character) order
-                                     For example, if the control video path is `frames`, the directory contain images such as 00.png, 01.png, 鈥?etc.
-  --increase-ref-index               automatically increase the indices of references images based on the order they are listed (starting with 1).
-  -o, --output OUTPUT                path to write result image to (default: ./output.png)
-  -p, --prompt [PROMPT]              the prompt to render
-  -n, --negative-prompt PROMPT       the negative prompt (default: "")
-  --cfg-scale SCALE                  unconditional guidance scale: (default: 7.0)
-  --img-cfg-scale SCALE              image guidance scale for inpaint or instruct-pix2pix models: (default: same as --cfg-scale)
-  --guidance SCALE                   distilled guidance scale for models with guidance input (default: 3.5)
-  --slg-scale SCALE                  skip layer guidance (SLG) scale, only for DiT models: (default: 0)
-                                     0 means disabled, a value of 2.5 is nice for sd3.5 medium
-  --eta SCALE                        eta in DDIM, only for DDIM and TCD: (default: 0)
-  --skip-layers LAYERS               Layers to skip for SLG steps: (default: [7,8,9])
-  --skip-layer-start START           SLG enabling point: (default: 0.01)
-  --skip-layer-end END               SLG disabling point: (default: 0.2)
-  --scheduler {discrete, karras, exponential, ays, gits, smoothstep, sgm_uniform, simple} Denoiser sigma scheduler (default: discrete)
-  --sampling-method {euler, euler_a, heun, dpm2, dpm++2s_a, dpm++2m, dpm++2mv2, ipndm, ipndm_v, lcm, ddim_trailing, tcd}
-                                     sampling method (default: "euler" for Flux/SD3/Wan, "euler_a" otherwise)
-  --timestep-shift N                 shift timestep for NitroFusion models, default: 0, recommended N for NitroSD-Realism around 250 and 500 for NitroSD-Vibrant
-  --steps  STEPS                     number of sample steps (default: 20)
-  --high-noise-cfg-scale SCALE       (high noise) unconditional guidance scale: (default: 7.0)
-  --high-noise-img-cfg-scale SCALE   (high noise) image guidance scale for inpaint or instruct-pix2pix models: (default: same as --cfg-scale)
-  --high-noise-guidance SCALE        (high noise) distilled guidance scale for models with guidance input (default: 3.5)
-  --high-noise-slg-scale SCALE       (high noise) skip layer guidance (SLG) scale, only for DiT models: (default: 0)
-                                     0 means disabled, a value of 2.5 is nice for sd3.5 medium
-  --high-noise-eta SCALE             (high noise) eta in DDIM, only for DDIM and TCD: (default: 0)
-  --high-noise-skip-layers LAYERS    (high noise) Layers to skip for SLG steps: (default: [7,8,9])
-  --high-noise-skip-layer-start      (high noise) SLG enabling point: (default: 0.01)
-  --high-noise-skip-layer-end END    (high noise) SLG disabling point: (default: 0.2)
-  --high-noise-scheduler {discrete, karras, exponential, ays, gits, smoothstep, sgm_uniform, simple} Denoiser sigma scheduler (default: discrete)
-  --high-noise-sampling-method {euler, euler_a, heun, dpm2, dpm++2s_a, dpm++2m, dpm++2mv2, ipndm, ipndm_v, lcm, ddim_trailing, tcd}
-                                     (high noise) sampling method (default: "euler_a")
-  --high-noise-steps  STEPS          (high noise) number of sample steps (default: -1 = auto)
-                                     SLG will be enabled at step int([STEPS]*[START]) and disabled at int([STEPS]*[END])
-  --strength STRENGTH                strength for noising/unnoising (default: 0.75)
-  --control-strength STRENGTH        strength to apply Control Net (default: 0.9)
-                                     1.0 corresponds to full destruction of information in init image
-  -H, --height H                     image height, in pixel space (default: 512)
-  -W, --width W                      image width, in pixel space (default: 512)
-  --rng {std_default, cuda}          RNG (default: cuda)
-  -s SEED, --seed SEED               RNG seed (default: 42, use random seed for < 0)
-  -b, --batch-count COUNT            number of images to generate
-  --prediction {eps, v, edm_v, sd3_flow, flux_flow} Prediction type override
-  --clip-skip N                      ignore last layers of CLIP network; 1 ignores none, 2 ignores one layer (default: -1)
-                                     <= 0 represents unspecified, will be 1 for SD1.x, 2 for SD2.x
-  --vae-tiling                       process vae in tiles to reduce memory usage
-  --vae-tile-size [X]x[Y]            tile size for vae tiling (default: 32x32)
-  --vae-relative-tile-size [X]x[Y]   relative tile size for vae tiling, in fraction of image size if < 1, in number of tiles per dim if >=1 (overrides --vae-tile-size)
-  --vae-tile-overlap OVERLAP         tile overlap for vae tiling, in fraction of tile size (default: 0.5)
-  --force-sdxl-vae-conv-scale        force use of conv scale on sdxl vae
-  --vae-on-cpu                       keep vae in cpu (for low vram)
-  --clip-on-cpu                      keep clip in cpu (for low vram)
-  --diffusion-fa                     use flash attention in the diffusion model (for low vram)
-                                     Might lower quality, since it implies converting k and v to f16.
-                                     This might crash if it is not supported by the backend.
-  --diffusion-conv-direct            use Conv2d direct in the diffusion model
-                                     This might crash if it is not supported by the backend.
-  --vae-conv-direct                  use Conv2d direct in the vae model (should improve the performance)
-                                     This might crash if it is not supported by the backend.
-  --control-net-cpu                  keep controlnet in cpu (for low vram)
-  --canny                            apply canny preprocessor (edge detection)
-  --color                            colors the logging tags according to level
-  --chroma-disable-dit-mask          disable dit mask for chroma
-  --chroma-enable-t5-mask            enable t5 mask for chroma
-  --chroma-t5-mask-pad  PAD_SIZE     t5 mask pad size of chroma
-  --video-frames                     video frames (default: 1)
-  --fps                              fps (default: 24)
-  --moe-boundary BOUNDARY            timestep boundary for Wan2.2 MoE model. (default: 0.875)
-                                     only enabled if `--high-noise-steps` is set to -1
-  --flow-shift SHIFT                 shift value for Flow models like SD3.x or WAN (default: auto)
-  --vace-strength                    wan vace strength
-  --photo-maker                      path to PHOTOMAKER model
-  --pm-id-images-dir [DIR]           path to PHOTOMAKER input id images dir
-  --pm-id-embed-path [PATH]          path to PHOTOMAKER v2 id embed
-  --pm-style-strength                strength for keeping PHOTOMAKER input identity (default: 20)
-  -v, --verbose                      print extra info
-```
\ No newline at end of file
diff --git a/examples/cli/main.cpp b/examples/cli/main.cpp
index ff36cea25..658704df0 100644
--- a/examples/cli/main.cpp
+++ b/examples/cli/main.cpp
@@ -113,6 +113,7 @@ struct SDParams {
     bool diffusion_flash_attn  = false;
     bool diffusion_conv_direct = false;
     bool vae_conv_direct       = false;
+    bool pad_circular          = false;
     bool canny_preprocess      = false;
     bool color                 = false;
     int upscale_repeats        = 1;
@@ -183,6 +184,7 @@ void print_params(SDParams params) {
     printf("    diffusion flash attention:         %s\n", params.diffusion_flash_attn ? "true" : "false");
     printf("    diffusion Conv2d direct:           %s\n", params.diffusion_conv_direct ? "true" : "false");
     printf("    vae_conv_direct:                   %s\n", params.vae_conv_direct ? "true" : "false");
+    printf("    pad circular:                      %s\n", params.pad_circular ? "true" : "false");
     printf("    control_strength:                  %.2f\n", params.control_strength);
     printf("    prompt:                            %s\n", params.prompt.c_str());
     printf("    negative_prompt:                   %s\n", params.negative_prompt.c_str());
@@ -304,6 +306,8 @@ void print_usage(int argc, const char* argv[]) {
     printf("                                     This might crash if it is not supported by the backend.\n");
     printf("  --vae-conv-direct                  use Conv2d direct in the vae model (should improve the performance)\n");
     printf("                                     This might crash if it is not supported by the backend.\n");
+    printf("  --pad-circular                     use circular padding instead of zero padding\n");
+    printf("                                     Applies to all model pad operations (may change results).\n");
     printf("  --control-net-cpu                  keep controlnet in cpu (for low vram)\n");
     printf("  --canny                            apply canny preprocessor (edge detection)\n");
     printf("  --color                            colors the logging tags according to level\n");
@@ -573,6 +577,7 @@ void parse_args(int argc, const char** argv, SDParams& params) {
         {"", "--diffusion-fa", "", true, &params.diffusion_flash_attn},
         {"", "--diffusion-conv-direct", "", true, &params.diffusion_conv_direct},
         {"", "--vae-conv-direct", "", true, &params.vae_conv_direct},
+        {"", "--pad-circular", "", true, &params.pad_circular},
         {"", "--canny", "", true, &params.canny_preprocess},
         {"-v", "--verbose", "", true, &params.verbose},
         {"", "--color", "", true, &params.color},
@@ -1386,6 +1391,7 @@ int main(int argc, const char* argv[]) {
         params.diffusion_flash_attn,
         params.diffusion_conv_direct,
         params.vae_conv_direct,
+        params.pad_circular,
         params.force_sdxl_vae_conv_scale,
         params.chroma_use_dit_mask,
         params.chroma_use_t5_mask,
diff --git a/flux.hpp b/flux.hpp
index 2ed410419..f8bd887a6 100644
--- a/flux.hpp
+++ b/flux.hpp
@@ -128,8 +128,8 @@ namespace Flux {
             // x: [N, n_token, dim]
             // pe: [n_token, d_head/2, 2, 2]
             // return [N, n_token, dim]
-            auto qkv = pre_attention(ctx, x);                                                        // q,k,v: [N, n_token, n_head, d_head]
-            x        = Rope::attention(ctx, backend, qkv[0], qkv[1], qkv[2], pe, mask, flash_attn);  // [N, n_token, dim]
+            auto qkv = pre_attention(ctx, x);  // q,k,v: [N, n_token, n_head, d_head]
+            x        = Rope::attention(ctx, backend, qkv[0], qkv[1], qkv[2], pe, mask, flash_attn, 1.0f, true, use_circular_pad());  // [N, n_token, dim]
             x        = post_attention(ctx, x);                                                       // [N, n_token, dim]
             return x;
         }
@@ -318,7 +318,7 @@ namespace Flux {
             auto k = ggml_concat(ctx, txt_k, img_k, 2);  // [N, n_txt_token + n_img_token, n_head, d_head]
             auto v = ggml_concat(ctx, txt_v, img_v, 2);  // [N, n_txt_token + n_img_token, n_head, d_head]
 
-            auto attn         = Rope::attention(ctx, backend, q, k, v, pe, mask, flash_attn);  // [N, n_txt_token + n_img_token, n_head*d_head]
+            auto attn         = Rope::attention(ctx, backend, q, k, v, pe, mask, flash_attn, 1.0f, true, use_circular_pad());  // [N, n_txt_token + n_img_token, n_head*d_head]
             attn              = ggml_cont(ctx, ggml_permute(ctx, attn, 0, 2, 1, 3));           // [n_txt_token + n_img_token, N, hidden_size]
             auto txt_attn_out = ggml_view_3d(ctx,
                                              attn,
@@ -453,7 +453,7 @@ namespace Flux {
             auto v           = ggml_reshape_4d(ctx, qkv_vec[2], head_dim, num_heads, qkv_vec[2]->ne[1], qkv_vec[2]->ne[2]);  // [N, n_token, n_head, d_head]
             q                = norm->query_norm(ctx, q);
             k                = norm->key_norm(ctx, k);
-            auto attn        = Rope::attention(ctx, backend, q, k, v, pe, mask, flash_attn);  // [N, n_token, hidden_size]
+            auto attn        = Rope::attention(ctx, backend, q, k, v, pe, mask, flash_attn, 1.0f, true, use_circular_pad());  // [N, n_token, hidden_size]
 
             auto attn_mlp = ggml_concat(ctx, attn, ggml_gelu_inplace(ctx, mlp), 0);  // [N, n_token, hidden_size + mlp_hidden_dim]
             auto output   = linear2->forward(ctx, attn_mlp);                         // [N, n_token, hidden_size]
@@ -696,7 +696,7 @@ namespace Flux {
                 vec = approx->forward(ctx, vec);                           // [344, N, hidden_size]
 
                 if (y != NULL) {
-                    txt_img_mask = ggml_pad(ctx, y, img->ne[1], 0, 0, 0);
+                    txt_img_mask = sd_pad(ctx, y, img->ne[1], 0, 0, 0, use_circular_pad());
                 }
             } else {
                 auto time_in   = std::dynamic_pointer_cast<MLPEmbedder>(blocks["time_in"]);
@@ -759,7 +759,7 @@ namespace Flux {
             int64_t patch_size = 2;
             int pad_h          = (patch_size - H % patch_size) % patch_size;
             int pad_w          = (patch_size - W % patch_size) % patch_size;
-            x                  = ggml_pad(ctx, x, pad_w, pad_h, 0, 0);  // [N, C, H + pad_h, W + pad_w]
+            x                  = sd_pad(ctx, x, pad_w, pad_h, 0, 0, use_circular_pad());  // [N, C, H + pad_h, W + pad_w]
 
             // img = rearrange(x, "b c (h ph) (w pw) -> b (h w) (c ph pw)", ph=patch_size, pw=patch_size)
             auto img = patchify(ctx, x, patch_size);  // [N, h*w, C * patch_size * patch_size]
@@ -815,9 +815,9 @@ namespace Flux {
                 ggml_tensor* mask    = ggml_view_4d(ctx, c_concat, c_concat->ne[0], c_concat->ne[1], 1, 1, c_concat->nb[1], c_concat->nb[2], c_concat->nb[3], c_concat->nb[2] * C);
                 ggml_tensor* control = ggml_view_4d(ctx, c_concat, c_concat->ne[0], c_concat->ne[1], C, 1, c_concat->nb[1], c_concat->nb[2], c_concat->nb[3], c_concat->nb[2] * (C + 1));
 
-                masked  = ggml_pad(ctx, masked, pad_w, pad_h, 0, 0);
-                mask    = ggml_pad(ctx, mask, pad_w, pad_h, 0, 0);
-                control = ggml_pad(ctx, control, pad_w, pad_h, 0, 0);
+                masked  = sd_pad(ctx, masked, pad_w, pad_h, 0, 0, use_circular_pad());
+                mask    = sd_pad(ctx, mask, pad_w, pad_h, 0, 0, use_circular_pad());
+                control = sd_pad(ctx, control, pad_w, pad_h, 0, 0, use_circular_pad());
 
                 masked  = patchify(ctx, masked, patch_size);
                 mask    = patchify(ctx, mask, patch_size);
@@ -827,7 +827,7 @@ namespace Flux {
             } else if (params.version == VERSION_FLUX_CONTROLS) {
                 GGML_ASSERT(c_concat != NULL);
 
-                ggml_tensor* control = ggml_pad(ctx, c_concat, pad_w, pad_h, 0, 0);
+                ggml_tensor* control = sd_pad(ctx, c_concat, pad_w, pad_h, 0, 0, use_circular_pad());
                 control              = patchify(ctx, control, patch_size);
                 img                  = ggml_concat(ctx, img, control, 0);
             }
@@ -925,18 +925,23 @@ namespace Flux {
         }
 
         std::string get_desc() {
             return "flux";
         }
 
         void get_param_tensors(std::map<std::string, struct ggml_tensor*>& tensors, const std::string prefix) {
             flux.get_param_tensors(tensors, prefix);
         }
 
+        void set_circular_pad(bool enabled) override {
+            GGMLRunner::set_circular_pad(enabled);
+            flux.set_circular_pad(enabled);
+        }
+
         struct ggml_cgraph* build_graph(struct ggml_tensor* x,
                                         struct ggml_tensor* timesteps,
                                         struct ggml_tensor* context,
                                         struct ggml_tensor* c_concat,
                                         struct ggml_tensor* y,
                                         struct ggml_tensor* guidance,
                                         std::vector<ggml_tensor*> ref_latents = {},
                                         bool increase_ref_index               = false,
diff --git a/ggml b/ggml
index 7bffd79a4..25d358c62 160000
--- a/ggml
+++ b/ggml
@@ -1 +1 @@
-Subproject commit 7bffd79a4bec72e9a3bfbedb582a218b84401c13
+Subproject commit 25d358c627186901b6506ee70faed598613eff05
diff --git a/ggml_extend.hpp b/ggml_extend.hpp
index d8df0d8f6..837233e27 100644
--- a/ggml_extend.hpp
+++ b/ggml_extend.hpp
@@ -23,6 +23,7 @@
 #include "ggml-alloc.h"
 #include "ggml-backend.h"
 #include "ggml-cpu.h"
+#include "ggml/src/ggml-impl.h"
 #include "ggml.h"
 
 #include "model.h"
@@ -60,23 +61,38 @@
 #define SD_UNUSED(x) (void)(x)
 #endif
 
-__STATIC_INLINE__ void ggml_log_callback_default(ggml_log_level level, const char* text, void*) {
-    switch (level) {
-        case GGML_LOG_LEVEL_DEBUG:
-            LOG_DEBUG(text);
-            break;
-        case GGML_LOG_LEVEL_INFO:
-            LOG_INFO(text);
-            break;
-        case GGML_LOG_LEVEL_WARN:
-            LOG_WARN(text);
-            break;
-        case GGML_LOG_LEVEL_ERROR:
-            LOG_ERROR(text);
-            break;
-        default:
-            LOG_DEBUG(text);
+// Marks an already-built pad node as circular when requested and hands the node back.
+__STATIC_INLINE__ struct ggml_tensor* sd_apply_pad_mode(struct ggml_tensor* padded, bool circular) {
+    if (circular) {
+        ggml_set_pad_mode(padded, GGML_PAD_MODE_CIRCULAR);
     }
+    return padded;
+}
+
+// ggml_pad with an optional circular pad mode on the resulting node.
+__STATIC_INLINE__ struct ggml_tensor* sd_pad(struct ggml_context* ctx,
+                                             struct ggml_tensor* a,
+                                             int p0,
+                                             int p1,
+                                             int p2,
+                                             int p3,
+                                             bool circular = false) {
+    return sd_apply_pad_mode(ggml_pad(ctx, a, p0, p1, p2, p3), circular);
+}
+
+// ggml_pad_ext with an optional circular pad mode on the resulting node.
+__STATIC_INLINE__ struct ggml_tensor* sd_pad_ext(struct ggml_context* ctx,
+                                                 struct ggml_tensor* a,
+                                                 int lp0,
+                                                 int rp0,
+                                                 int lp1,
+                                                 int rp1,
+                                                 int lp2,
+                                                 int rp2,
+                                                 int lp3,
+                                                 int rp3,
+                                                 bool circular = false) {
+    return sd_apply_pad_mode(ggml_pad_ext(ctx, a, lp0, rp0, lp1, rp1, lp2, rp2, lp3, rp3), circular);
 }
 
 static_assert(GGML_MAX_NAME >= 128, "GGML_MAX_NAME must be at least 128");
@@ -982,7 +996,8 @@ __STATIC_INLINE__ struct ggml_tensor* ggml_nn_conv_2d(struct ggml_context* ctx,
                                                       int d0      = 1,
                                                       int d1      = 1,
                                                       bool direct = false,
-                                                      float scale = 1.f) {
+                                                      float scale = 1.f,
+                                                      bool circular = false) {
     if (scale != 1.f) {
         x = ggml_scale(ctx, x, scale);
     }
@@ -991,6 +1006,7 @@ __STATIC_INLINE__ struct ggml_tensor* ggml_nn_conv_2d(struct ggml_context* ctx,
     } else {
         x = ggml_conv_2d(ctx, w, x, s0, s1, p0, p1, d0, d1);
     }
+    ggml_set_op_params_i32(x, 6, circular ? 1 : 0);
     if (scale != 1.f) {
         x = ggml_scale(ctx, x, 1.f / scale);
     }
@@ -1150,7 +1166,8 @@ __STATIC_INLINE__ struct ggml_tensor* ggml_nn_attention_ext(struct ggml_context*
                                                             bool diag_mask_inf       = false,
                                                             bool skip_reshape        = false,
                                                             bool flash_attn          = false,  // avoid overflow
-                                                            float kv_scale           = 1.0f) {
+                                                            float kv_scale           = 1.0f,
+                                                            bool pad_circular        = false) {
     int64_t L_q;
     int64_t L_k;
     int64_t C;
@@ -1190,7 +1207,7 @@ __STATIC_INLINE__ struct ggml_tensor* ggml_nn_attention_ext(struct ggml_context*
 
     auto build_kqv = [&](ggml_tensor* q_in, ggml_tensor* k_in, ggml_tensor* v_in, ggml_tensor* mask_in) -> ggml_tensor* {
         if (kv_pad != 0) {
-            k_in = ggml_pad(ctx, k_in, 0, kv_pad, 0, 0);
+            k_in = sd_pad(ctx, k_in, 0, kv_pad, 0, 0, pad_circular);
         }
         if (kv_scale != 1.0f) {
             k_in = ggml_scale(ctx, k_in, kv_scale);
@@ -1200,7 +1217,7 @@ __STATIC_INLINE__ struct ggml_tensor* ggml_nn_attention_ext(struct ggml_context*
         v_in = ggml_nn_cont(ctx, ggml_permute(ctx, v_in, 0, 2, 1, 3));
         v_in = ggml_reshape_3d(ctx, v_in, d_head, L_k, n_kv_head * N);
         if (kv_pad != 0) {
-            v_in = ggml_pad(ctx, v_in, 0, kv_pad, 0, 0);
+            v_in = sd_pad(ctx, v_in, 0, kv_pad, 0, 0, pad_circular);
         }
         if (kv_scale != 1.0f) {
             v_in = ggml_scale(ctx, v_in, kv_scale);
@@ -1223,7 +1240,7 @@ __STATIC_INLINE__ struct ggml_tensor* ggml_nn_attention_ext(struct ggml_context*
                 mask_pad = GGML_PAD(L_q, GGML_KQ_MASK_PAD) - mask_in->ne[1];
             }
             if (mask_pad > 0) {
-                mask_in = ggml_pad(ctx, mask_in, 0, mask_pad, 0, 0);
+                mask_in = sd_pad(ctx, mask_in, 0, mask_pad, 0, 0, pad_circular);
             }
             mask_in = ggml_cast(ctx, mask_in, GGML_TYPE_F16);
         }
@@ -1715,6 +1732,10 @@ struct GGMLRunner {
 public:
     virtual std::string get_desc() = 0;
 
+    virtual void set_circular_pad(bool enabled) {
+        SD_UNUSED(enabled);
+    }
+
     GGMLRunner(ggml_backend_t backend, bool offload_params_to_cpu = false)
         : runtime_backend(backend) {
         alloc_params_ctx();
@@ -1864,6 +1885,7 @@ class GGMLBlock {
     typedef std::unordered_map<std::string, std::shared_ptr<GGMLBlock>> GGMLBlockMap;
     GGMLBlockMap blocks;
     ParameterMap params;
+    bool pad_circular_ = false;
 
     ggml_type get_type(const std::string& name, const String2GGMLType& tensor_types, ggml_type default_type) {
         auto iter = tensor_types.find(name);
@@ -1935,6 +1957,29 @@ class GGMLBlock {
         return "GGMLBlock";
     }
 
+    // Stores the circular-padding flag on this block and recursively on every non-null child block.
+    virtual void set_circular_pad(bool enabled) {
+        pad_circular_ = enabled;
+        for (auto& entry : blocks) {
+            auto& child = entry.second;
+            if (!child) {
+                continue;
+            }
+            child->set_circular_pad(enabled);
+        }
+    }
+
+    // Current circular-padding flag of this block.
+    bool use_circular_pad() const {
+        return pad_circular_;
+    }
+
+    // Same value as use_circular_pad(), under the name used by callers outside the block hierarchy.
+    bool is_circular_pad_enabled() const {
+        return use_circular_pad();
+    }
+
+public:
     void get_all_blocks(std::vector<GGMLBlock*>& result) {
         result.push_back(this);
         for (auto& block_iter : blocks) {
@@ -2075,14 +2115,17 @@ class Conv2d : public UnaryBlock {
            std::pair<int, int> stride   = {1, 1},
            std::pair<int, int> padding  = {0, 0},
            std::pair<int, int> dilation = {1, 1},
-           bool bias                    = true)
+           bool bias                    = true,
+           bool circular_pad            = false)
         : in_channels(in_channels),
           out_channels(out_channels),
           kernel_size(kernel_size),
           stride(stride),
           padding(padding),
           dilation(dilation),
-          bias(bias) {}
+          bias(bias) {
+        set_circular_pad(circular_pad);
+    }
 
     void enable_direct() {
         direct = true;
@@ -2102,18 +2145,38 @@ class Conv2d : public UnaryBlock {
         if (bias) {
             b = params["bias"];
         }
-        return ggml_nn_conv_2d(ctx,
+        struct ggml_tensor* input = x;
+        const int pad_w           = padding.second;
+        const int pad_h           = padding.first;
+        const bool circular = use_circular_pad();
+        if (circular && (pad_w != 0 || pad_h != 0)) {
+            input = sd_pad_ext(ctx,
                                x,
+                               pad_w,
+                               pad_w,
+                               pad_h,
+                               pad_h,
+                               0,
+                               0,
+                               0,
+                               0,
+                               circular);
+        }
+        const int conv_pad_w = circular ? 0 : pad_w;
+        const int conv_pad_h = circular ? 0 : pad_h;
+        return ggml_nn_conv_2d(ctx,
+                               input,
                                w,
                                b,
                                stride.second,
                                stride.first,
-                               padding.second,
-                               padding.first,
+                               conv_pad_w,
+                               conv_pad_h,
                                dilation.second,
                                dilation.first,
                                direct,
-                               scale);
+                               scale,
+                               use_circular_pad());
     }
 };
 
@@ -2140,17 +2203,20 @@ class Conv3dnx1x1 : public UnaryBlock {
     Conv3dnx1x1(int64_t in_channels,
                 int64_t out_channels,
                 int64_t kernel_size,
-                int64_t stride   = 1,
-                int64_t padding  = 0,
-                int64_t dilation = 1,
-                bool bias        = true)
+                int64_t stride       = 1,
+                int64_t padding      = 0,
+                int64_t dilation     = 1,
+                bool bias            = true,
+                bool circular_pad    = false)
         : in_channels(in_channels),
           out_channels(out_channels),
           kernel_size(kernel_size),
           stride(stride),
           padding(padding),
           dilation(dilation),
-          bias(bias) {}
+          bias(bias) {
+        set_circular_pad(circular_pad);
+    }
 
     // x: [N, IC, ID, IH*IW]
     // result: [N, OC, OD, OH*OW]
@@ -2160,7 +2226,27 @@ class Conv3dnx1x1 : public UnaryBlock {
         if (bias) {
             b = params["bias"];
         }
-        return ggml_nn_conv_3d_nx1x1(ctx, x, w, b, stride, padding, dilation);
+        struct ggml_tensor* input = x;
+        const bool circular       = use_circular_pad();
+        if (circular && padding != 0) {
+            // x is [N, IC, ID, IH*IW], i.e. ggml ne = {IH*IW, ID, IC, N}: the convolved depth axis is
+            // ne[1], so the wrap-around padding goes on axis 1 (axis 2 would grow the channel count).
+            const int pad_d = static_cast<int>(padding);
+            input = sd_pad_ext(ctx,
+                               x,
+                               0,
+                               0,
+                               pad_d,
+                               pad_d,
+                               0,
+                               0,
+                               0,
+                               0,
+                               circular);
+        }
+        // Padding was applied explicitly above, so the convolution itself must not pad again.
+        const int64_t conv_pad = circular ? 0 : padding;
+        return ggml_nn_conv_3d_nx1x1(ctx, input, w, b, stride, conv_pad, dilation);
     }
 };
 
@@ -2209,9 +2291,34 @@ class Conv3d : public UnaryBlock {
         if (bias) {
             b = params["bias"];
         }
-        return ggml_nn_conv_3d(ctx, x, w, b, in_channels,
+        struct ggml_tensor* input = x;
+        // ggml_nn_conv_3d receives the tuples as <2>, <1>, <0>; assuming (as for ggml_nn_conv_2d) that
+        // its arguments are listed in ggml axis order, <2> is the axis-0 (width) padding and <0> the
+        // axis-2 (depth) padding, and the explicit padding below has to use the same mapping.
+        const int pad_w     = std::get<2>(padding);
+        const int pad_h     = std::get<1>(padding);
+        const int pad_d     = std::get<0>(padding);
+        const bool circular = use_circular_pad();
+        if (circular && (pad_w != 0 || pad_h != 0 || pad_d != 0)) {
+            input = sd_pad_ext(ctx,
+                               x,
+                               pad_w,
+                               pad_w,
+                               pad_h,
+                               pad_h,
+                               pad_d,
+                               pad_d,
+                               0,
+                               0,
+                               circular);
+        }
+        // Circular padding was applied explicitly, so the convolution must not pad a second time.
+        const int conv_pad_w = circular ? 0 : pad_w;
+        const int conv_pad_h = circular ? 0 : pad_h;
+        const int conv_pad_d = circular ? 0 : pad_d;
+        return ggml_nn_conv_3d(ctx, input, w, b, in_channels,
                                std::get<2>(stride), std::get<1>(stride), std::get<0>(stride),
-                               std::get<2>(padding), std::get<1>(padding), std::get<0>(padding),
+                               conv_pad_w, conv_pad_h, conv_pad_d,
                                std::get<2>(dilation), std::get<1>(dilation), std::get<0>(dilation));
     }
 };
@@ -2369,7 +2472,7 @@ class MultiheadAttention : public GGMLBlock {
         struct ggml_tensor* k = k_proj->forward(ctx, x);
         struct ggml_tensor* v = v_proj->forward(ctx, x);
 
-        x = ggml_nn_attention_ext(ctx, backend, q, k, v, n_head, NULL, mask);  // [N, n_token, embed_dim]
+        x = ggml_nn_attention_ext(ctx, backend, q, k, v, n_head, NULL, mask, false, false, 1.0f, use_circular_pad());  // [N, n_token, embed_dim]
 
         x = out_proj->forward(ctx, x);  // [N, n_token, embed_dim]
         return x;
diff --git a/mmdit.hpp b/mmdit.hpp
index d9d19340c..55ef11df7 100644
--- a/mmdit.hpp
+++ b/mmdit.hpp
@@ -80,7 +80,7 @@ struct PatchEmbed : public GGMLBlock {
             int64_t H = x->ne[1];
             int pad_h = (patch_size - H % patch_size) % patch_size;
             int pad_w = (patch_size - W % patch_size) % patch_size;
-            x         = ggml_pad(ctx, x, pad_w, pad_h, 0, 0);  // TODO: reflect pad mode
+            x         = sd_pad(ctx, x, pad_w, pad_h, 0, 0, use_circular_pad());  // TODO: reflect pad mode
         }
         x = proj->forward(ctx, x);
 
@@ -208,7 +208,7 @@ class SelfAttention : public GGMLBlock {
                                 ggml_backend_t backend,
                                 struct ggml_tensor* x) {
         auto qkv = pre_attention(ctx, x);
-        x        = ggml_nn_attention_ext(ctx, backend, qkv[0], qkv[1], qkv[2], num_heads, NULL, false, false, true);  // [N, n_token, dim]
+        x        = ggml_nn_attention_ext(ctx, backend, qkv[0], qkv[1], qkv[2], num_heads, NULL, false, false, true, 1.0f, use_circular_pad());  // [N, n_token, dim]
         x        = post_attention(ctx, x);                                                                            // [N, n_token, dim]
         return x;
     }
@@ -439,8 +439,8 @@ struct DismantledBlock : public GGMLBlock {
             auto qkv2          = std::get<1>(qkv_intermediates);
             auto intermediates = std::get<2>(qkv_intermediates);
 
-            auto attn_out  = ggml_nn_attention_ext(ctx, backend, qkv[0], qkv[1], qkv[2], num_heads, NULL, false, false, flash_attn);     // [N, n_token, dim]
-            auto attn2_out = ggml_nn_attention_ext(ctx, backend, qkv2[0], qkv2[1], qkv2[2], num_heads, NULL, false, false, flash_attn);  // [N, n_token, dim]
+            auto attn_out  = ggml_nn_attention_ext(ctx, backend, qkv[0], qkv[1], qkv[2], num_heads, NULL, false, false, flash_attn, 1.0f, use_circular_pad());     // [N, n_token, dim]
+            auto attn2_out = ggml_nn_attention_ext(ctx, backend, qkv2[0], qkv2[1], qkv2[2], num_heads, NULL, false, false, flash_attn, 1.0f, use_circular_pad());  // [N, n_token, dim]
             x              = post_attention_x(ctx,
                                               attn_out,
                                               attn2_out,
@@ -456,7 +456,7 @@ struct DismantledBlock : public GGMLBlock {
             auto qkv               = qkv_intermediates.first;
             auto intermediates     = qkv_intermediates.second;
 
-            auto attn_out = ggml_nn_attention_ext(ctx, backend, qkv[0], qkv[1], qkv[2], num_heads, NULL, false, false, flash_attn);  // [N, n_token, dim]
+            auto attn_out = ggml_nn_attention_ext(ctx, backend, qkv[0], qkv[1], qkv[2], num_heads, NULL, false, false, flash_attn, 1.0f, use_circular_pad());  // [N, n_token, dim]
             x             = post_attention(ctx,
                                            attn_out,
                                            intermediates[0],
@@ -502,7 +502,7 @@ block_mixing(struct ggml_context* ctx,
         qkv.push_back(ggml_concat(ctx, context_qkv[i], x_qkv[i], 1));
     }
 
-    auto attn         = ggml_nn_attention_ext(ctx, backend, qkv[0], qkv[1], qkv[2], x_block->num_heads, NULL, false, false, flash_attn);  // [N, n_context + n_token, hidden_size]
+    auto attn         = ggml_nn_attention_ext(ctx, backend, qkv[0], qkv[1], qkv[2], x_block->num_heads, NULL, false, false, flash_attn, 1.0f, x_block->is_circular_pad_enabled());  // [N, n_context + n_token, hidden_size]
     attn              = ggml_cont(ctx, ggml_permute(ctx, attn, 0, 2, 1, 3));                                                              // [n_context + n_token, N, hidden_size]
     auto context_attn = ggml_view_3d(ctx,
                                      attn,
@@ -536,7 +536,7 @@ block_mixing(struct ggml_context* ctx,
     }
 
     if (x_block->self_attn) {
-        auto attn2 = ggml_nn_attention_ext(ctx, backend, x_qkv2[0], x_qkv2[1], x_qkv2[2], x_block->num_heads);  // [N, n_token, hidden_size]
+        auto attn2 = ggml_nn_attention_ext(ctx, backend, x_qkv2[0], x_qkv2[1], x_qkv2[2], x_block->num_heads, NULL, false, false, false, 1.0f, x_block->is_circular_pad_enabled());  // [N, n_token, hidden_size]
 
         x = x_block->post_attention_x(ctx,
                                       x_attn,
@@ -883,6 +883,11 @@ struct MMDiTRunner : public GGMLRunner {
         mmdit.get_param_tensors(tensors, prefix);
     }
 
+    void set_circular_pad(bool enabled) override {
+        GGMLRunner::set_circular_pad(enabled);
+        mmdit.set_circular_pad(enabled);
+    }
+
     struct ggml_cgraph* build_graph(struct ggml_tensor* x,
                                     struct ggml_tensor* timesteps,
                                     struct ggml_tensor* context,
@@ -997,4 +1002,4 @@ struct MMDiTRunner : public GGMLRunner {
     }
 };
 
-#endif
\ No newline at end of file
+#endif
diff --git a/pmid.hpp b/pmid.hpp
index 63029cbc0..0dadf7f4a 100644
--- a/pmid.hpp
+++ b/pmid.hpp
@@ -445,6 +445,12 @@ struct PhotoMakerIDEncoder : public GGMLRunner {
             id_encoder2.get_param_tensors(tensors, prefix);
     }
 
+    void set_circular_pad(bool enabled) override {
+        GGMLRunner::set_circular_pad(enabled);
+        id_encoder.set_circular_pad(enabled);
+        id_encoder2.set_circular_pad(enabled);
+    }
+
     struct ggml_cgraph* build_graph(  // struct ggml_allocr* allocr,
         struct ggml_tensor* id_pixel_values,
         struct ggml_tensor* prompt_embeds,
diff --git a/qwen_image.hpp b/qwen_image.hpp
index ce4e62dce..566d136c8 100644
--- a/qwen_image.hpp
+++ b/qwen_image.hpp
@@ -159,7 +159,7 @@ namespace Qwen {
             auto k = ggml_concat(ctx, txt_k, img_k, 2);  // [N, n_txt_token + n_img_token, n_head, d_head]
             auto v = ggml_concat(ctx, txt_v, img_v, 2);  // [N, n_txt_token + n_img_token, n_head, d_head]
 
-            auto attn         = Rope::attention(ctx, backend, q, k, v, pe, mask, flash_attn, (1.0f / 128.f));  // [N, n_txt_token + n_img_token, n_head*d_head]
+            auto attn         = Rope::attention(ctx, backend, q, k, v, pe, mask, flash_attn, (1.0f / 128.f), true, use_circular_pad());  // [N, n_txt_token + n_img_token, n_head*d_head]
             attn              = ggml_cont(ctx, ggml_permute(ctx, attn, 0, 2, 1, 3));                           // [n_txt_token + n_img_token, N, hidden_size]
             auto txt_attn_out = ggml_view_3d(ctx,
                                              attn,
@@ -363,7 +363,7 @@ namespace Qwen {
 
             int pad_h = (params.patch_size - H % params.patch_size) % params.patch_size;
             int pad_w = (params.patch_size - W % params.patch_size) % params.patch_size;
-            x         = ggml_pad(ctx, x, pad_w, pad_h, 0, 0);  // [N, C, H + pad_h, W + pad_w]
+            x         = sd_pad(ctx, x, pad_w, pad_h, 0, 0, use_circular_pad());  // [N, C, H + pad_h, W + pad_w]
             return x;
         }
 
@@ -547,6 +547,11 @@ namespace Qwen {
             qwen_image.get_param_tensors(tensors, prefix);
         }
 
+        void set_circular_pad(bool enabled) override {
+            GGMLRunner::set_circular_pad(enabled);
+            qwen_image.set_circular_pad(enabled);
+        }
+
         struct ggml_cgraph* build_graph(struct ggml_tensor* x,
                                         struct ggml_tensor* timesteps,
                                         struct ggml_tensor* context,
@@ -571,7 +576,8 @@ namespace Qwen {
                                                   ref_latents,
                                                   increase_ref_index,
                                                   qwen_image_params.theta,
-                                                  qwen_image_params.axes_dim);
+                                                  qwen_image_params.axes_dim,
+                                                  qwen_image.is_circular_pad_enabled());
             int pos_len = pe_vec.size() / qwen_image_params.axes_dim_sum / 2;
             // LOG_DEBUG("pos_len %d", pos_len);
             auto pe = ggml_new_tensor_4d(compute_ctx, GGML_TYPE_F32, 2, 2, qwen_image_params.axes_dim_sum / 2, pos_len);
@@ -691,4 +697,4 @@ namespace Qwen {
 
 }  // namespace name
 
-#endif  // __QWEN_IMAGE_HPP__
\ No newline at end of file
+#endif  // __QWEN_IMAGE_HPP__
diff --git a/qwenvl.hpp b/qwenvl.hpp
index 881f54d78..8d966637a 100644
--- a/qwenvl.hpp
+++ b/qwenvl.hpp
@@ -524,7 +524,7 @@ namespace Qwen {
             auto k = ggml_reshape_4d(ctx, qkv_vec[1], head_dim, num_heads, qkv_vec[1]->ne[1], qkv_vec[1]->ne[2]);  // [N, n_token, n_head, d_head]
             auto v = ggml_reshape_4d(ctx, qkv_vec[2], head_dim, num_heads, qkv_vec[2]->ne[1], qkv_vec[2]->ne[2]);  // [N, n_token, n_head, d_head]
 
-            x = Rope::attention(ctx, backend, q, k, v, pe, mask, false, 1.f, false);  // [N, n_token, hidden_size]
+            x = Rope::attention(ctx, backend, q, k, v, pe, mask, false, 1.f, false, use_circular_pad());  // [N, n_token, hidden_size]
 
             x = proj->forward(ctx, x);  // [N, n_token, hidden_size]
             return x;
@@ -692,7 +692,7 @@ namespace Qwen {
             k = ggml_cont(ctx, ggml_torch_permute(ctx, k, 0, 2, 1, 3));            // [N, num_kv_heads, n_token, head_dim]
             k = ggml_reshape_3d(ctx, k, k->ne[0], k->ne[1], k->ne[2] * k->ne[3]);  // [N*num_kv_heads, n_token, head_dim]
 
-            x = ggml_nn_attention_ext(ctx, backend, q, k, v, num_heads, nullptr, true, true, false);  // [N, n_token, hidden_size]
+            x = ggml_nn_attention_ext(ctx, backend, q, k, v, num_heads, nullptr, true, true, false, 1.0f, use_circular_pad());  // [N, n_token, hidden_size]
 
             x = out_proj->forward(ctx, x);  // [N, n_token, hidden_size]
             return x;
diff --git a/rope.hpp b/rope.hpp
index 295c9a217..b594e9ceb 100644
--- a/rope.hpp
+++ b/rope.hpp
@@ -1,6 +1,8 @@
 #ifndef __ROPE_HPP__
 #define __ROPE_HPP__
 
+#include <algorithm>
+#include <cmath>
 #include <vector>
 #include "ggml_extend.hpp"
 
@@ -39,32 +41,60 @@ namespace Rope {
         return flat_vec;
     }
 
-    __STATIC_INLINE__ std::vector<std::vector<float>> rope(const std::vector<float>& pos, int dim, int theta) {
+    // Builds the rotary-embedding table for `pos`: for every position and each of the dim/2 frequencies
+    // it stores the four rotation entries [cos, -sin, sin, cos].
+    // `wraps` optionally supplies one wrap length per position. For a wrap length > 0 the angle is blended
+    // between the original one (mid-period) and that of the closest frequency that is periodic over the
+    // wrap length (period edges); 0, a null pointer or a missing entry leaves the angle untouched.
+    __STATIC_INLINE__ std::vector<std::vector<float>> rope(const std::vector<float>& pos,
+                                                           int dim,
+                                                           int theta,
+                                                           const std::vector<int>* wraps = nullptr) {
         assert(dim % 2 == 0);
         int half_dim = dim / 2;
 
+        std::vector<std::vector<float>> result(pos.size(), std::vector<float>(half_dim * 4));
+
         std::vector<float> scale = linspace(0.f, (dim * 1.f - 2) / dim, half_dim);
 
         std::vector<float> omega(half_dim);
         for (int i = 0; i < half_dim; ++i) {
-            omega[i] = 1.0 / std::pow(theta, scale[i]);
-        }
-
-        int pos_size = pos.size();
-        std::vector<std::vector<float>> out(pos_size, std::vector<float>(half_dim));
-        for (int i = 0; i < pos_size; ++i) {
-            for (int j = 0; j < half_dim; ++j) {
-                out[i][j] = pos[i] * omega[j];
-            }
+            omega[i] = 1.0f / std::pow(theta, scale[i]);
         }
 
-        std::vector<std::vector<float>> result(pos_size, std::vector<float>(half_dim * 4));
-        for (int i = 0; i < pos_size; ++i) {
+        constexpr float TWO_PI = 6.28318530717958647692f;
+        for (size_t i = 0; i < pos.size(); ++i) {
+            const float position = pos[i];
+            // Everything that depends only on the position is computed once, outside the frequency loop.
+            const int wrap     = (wraps != nullptr && i < wraps->size()) ? (*wraps)[i] : 0;
+            const float wrap_f = static_cast<float>(wrap);
+            float window       = 0.0f;
+            if (wrap > 0) {
+                float rel_pos = std::fmod(position, wrap_f);
+                if (rel_pos < 0.0f) {
+                    rel_pos += wrap_f;
+                }
+                const float t = rel_pos / wrap_f;
+                // Blend weight: 0 at the period edges, 1 in the middle.
+                window = std::clamp(0.5f - 0.5f * std::cos(TWO_PI * t), 0.0f, 1.0f);
+            }
             for (int j = 0; j < half_dim; ++j) {
-                result[i][4 * j]     = std::cos(out[i][j]);
-                result[i][4 * j + 1] = -std::sin(out[i][j]);
-                result[i][4 * j + 2] = std::sin(out[i][j]);
-                result[i][4 * j + 3] = std::cos(out[i][j]);
+                const float omega_val      = omega[j];
+                const float original_angle = position * omega_val;
+                float angle                = original_angle;
+                if (wrap > 0) {
+                    // Closest frequency completing a whole number of cycles over the wrap length.
+                    const float cycles         = omega_val * wrap_f / TWO_PI;
+                    const float periodic_omega = TWO_PI * std::round(cycles) / wrap_f;
+                    const float periodic_angle = position * periodic_omega;
+                    angle                      = periodic_angle + window * (original_angle - periodic_angle);
+                }
+                const float sin_val  = std::sin(angle);
+                const float cos_val  = std::cos(angle);
+                result[i][4 * j]     = cos_val;
+                result[i][4 * j + 1] = -sin_val;
+                result[i][4 * j + 2] = sin_val;
+                result[i][4 * j + 3] = cos_val;
             }
         }
 
@@ -122,7 +143,8 @@ namespace Rope {
     __STATIC_INLINE__ std::vector<float> embed_nd(const std::vector<std::vector<float>>& ids,
                                                   int bs,
                                                   int theta,
-                                                  const std::vector<int>& axes_dim) {
+                                                  const std::vector<int>& axes_dim,
+                                                  const std::vector<std::vector<int>>* axes_wraps = nullptr) {
         std::vector<std::vector<float>> trans_ids = transpose(ids);
         size_t pos_len                            = ids.size() / bs;
         int num_axes                              = axes_dim.size();
@@ -137,7 +159,12 @@ namespace Rope {
         std::vector<std::vector<float>> emb(bs * pos_len, std::vector<float>(emb_dim * 2 * 2, 0.0));
         int offset = 0;
         for (int i = 0; i < num_axes; ++i) {
-            std::vector<std::vector<float>> rope_emb = rope(trans_ids[i], axes_dim[i], theta);  // [bs*pos_len, axes_dim[i]/2 * 2 * 2]
+            const std::vector<int>* axis_wrap = nullptr;
+            if (axes_wraps != nullptr && i < (int)axes_wraps->size()) {
+                axis_wrap = &(*axes_wraps)[i];
+            }
+            std::vector<std::vector<float>> rope_emb =
+                rope(trans_ids[i], axes_dim[i], theta, axis_wrap);  // [bs*pos_len, axes_dim[i]/2 * 2 * 2]
             for (int b = 0; b < bs; ++b) {
                 for (int j = 0; j < pos_len; ++j) {
                     for (int k = 0; k < rope_emb[0].size(); ++k) {
@@ -250,9 +277,59 @@ namespace Rope {
                                                            const std::vector<ggml_tensor*>& ref_latents,
                                                            bool increase_ref_index,
                                                            int theta,
-                                                           const std::vector<int>& axes_dim) {
+                                                           const std::vector<int>& axes_dim,
+                                                           bool circular = false) {
         std::vector<std::vector<float>> ids = gen_qwen_image_ids(h, w, patch_size, bs, context_len, ref_latents, increase_ref_index);
-        return embed_nd(ids, bs, theta, axes_dim);
+        // Wrap lengths are tracked per token for the row/column axes so only spatial tokens become periodic.
+        std::vector<std::vector<int>> axes_wraps;
+        if (circular && bs > 0 && axes_dim.size() >= 3) {
+            // Patch count along one extent after padding it up to a multiple of patch_size.
+            auto patch_count = [patch_size](int extent) {
+                const int pad = (patch_size - (extent % patch_size)) % patch_size;
+                return (extent + pad) / patch_size;
+            };
+            // A run of consecutive spatial tokens sharing one grid: the image, then each reference latent.
+            struct SpatialRun {
+                size_t tokens;
+                int h_len;
+                int w_len;
+            };
+
+            const int h_len = patch_count(h);
+            const int w_len = patch_count(w);
+            if (h_len > 0 && w_len > 0) {
+                const size_t total_tokens     = ids.size();
+                const size_t tokens_per_batch = total_tokens / bs;
+                axes_wraps.assign(axes_dim.size(), std::vector<int>(total_tokens, 0));
+
+                std::vector<SpatialRun> runs;
+                runs.reserve(ref_latents.size() + 1);
+                runs.push_back({static_cast<size_t>(h_len) * static_cast<size_t>(w_len), h_len, w_len});
+                for (ggml_tensor* ref : ref_latents) {
+                    if (ref == nullptr) {
+                        runs.push_back({0, 0, 0});  // keeps the slot, contributes no tokens
+                        continue;
+                    }
+                    const int ref_h_len = patch_count(static_cast<int>(ref->ne[1]));
+                    const int ref_w_len = patch_count(static_cast<int>(ref->ne[0]));
+                    runs.push_back({static_cast<size_t>(ref_h_len) * static_cast<size_t>(ref_w_len), ref_h_len, ref_w_len});
+                }
+
+                for (int b = 0; b < bs; ++b) {
+                    // Skip the first context_len tokens of the batch; they keep wrap length 0.
+                    size_t cursor = static_cast<size_t>(b) * tokens_per_batch + static_cast<size_t>(context_len);
+                    for (const SpatialRun& run : runs) {
+                        for (size_t idx = 0; idx < run.tokens && (cursor + idx) < total_tokens; ++idx) {
+                            axes_wraps[1][cursor + idx] = run.h_len;
+                            axes_wraps[2][cursor + idx] = run.w_len;
+                        }
+                        cursor += run.tokens;
+                    }
+                }
+            }
+        }
+        const std::vector<std::vector<int>>* wraps_ptr = axes_wraps.empty() ? nullptr : &axes_wraps;
+        return embed_nd(ids, bs, theta, axes_dim, wraps_ptr);
     }
 
     __STATIC_INLINE__ std::vector<std::vector<float>> gen_vid_ids(int t,
@@ -395,14 +489,15 @@ namespace Rope {
                                                     struct ggml_tensor* mask,
                                                     bool flash_attn,
                                                     float kv_scale        = 1.0f,
-                                                    bool rope_interleaved = true) {
+                                                    bool rope_interleaved = true,
+                                                    bool pad_circular     = false) {  // passed through unchanged to ggml_nn_attention_ext
         // q,k,v: [N, L, n_head, d_head]
         // pe: [L, d_head/2, 2, 2]
         // return: [N, L, n_head*d_head]
         q = apply_rope(ctx, q, pe, rope_interleaved);  // [N*n_head, L, d_head]
         k = apply_rope(ctx, k, pe, rope_interleaved);  // [N*n_head, L, d_head]
 
-        auto x = ggml_nn_attention_ext(ctx, backend, q, k, v, v->ne[1], mask, false, true, flash_attn, kv_scale);  // [N, L, n_head*d_head]
+        auto x = ggml_nn_attention_ext(ctx, backend, q, k, v, v->ne[1], mask, false, true, flash_attn, kv_scale, pad_circular);  // [N, L, n_head*d_head]
         return x;
     }
 };  // namespace Rope
diff --git a/stable-diffusion.cpp b/stable-diffusion.cpp
index 87b6a3779..ecf0b602f 100644
--- a/stable-diffusion.cpp
+++ b/stable-diffusion.cpp
@@ -115,6 +115,7 @@ class StableDiffusionGGML {
     sd_tiling_params_t vae_tiling_params = {false, 0, 0, 0.5f, 0, 0};
     bool offload_params_to_cpu           = false;
     bool stacked_id                      = false;
+    bool use_circular_pad                = false;  // copied from sd_ctx_params->pad_circular, then handed to each runner's set_circular_pad()
 
     bool is_using_v_parameterization     = false;
     bool is_using_edm_v_parameterization = false;
@@ -187,6 +188,7 @@ class StableDiffusionGGML {
         taesd_path              = SAFE_STR(sd_ctx_params->taesd_path);
         use_tiny_autoencoder    = taesd_path.size() > 0;
         offload_params_to_cpu   = sd_ctx_params->offload_params_to_cpu;
+        use_circular_pad        = sd_ctx_params->pad_circular;
 
         if (sd_ctx_params->rng_type == STD_DEFAULT_RNG) {
             rng = std::make_shared<STDDefaultRNG>();
@@ -422,6 +424,7 @@ class StableDiffusionGGML {
                     clip_vision = std::make_shared<FrozenCLIPVisionEmbedder>(backend,
                                                                              offload_params_to_cpu,
                                                                              model_loader.tensor_storages_types);
+                    clip_vision->set_circular_pad(use_circular_pad);
                     clip_vision->alloc_params_buffer();
                     clip_vision->get_param_tensors(tensors);
                 }
@@ -470,6 +473,13 @@ class StableDiffusionGGML {
             cond_stage_model->alloc_params_buffer();
             cond_stage_model->get_param_tensors(tensors);
 
+            if (diffusion_model) {  // NOTE(review): diffusion_model is dereferenced without a check right after this block, so this guard is purely defensive
+                diffusion_model->set_circular_pad(use_circular_pad);  // applied before alloc_params_buffer() below
+            }
+            if (high_noise_diffusion_model) {  // presumably only present for some model types - confirm
+                high_noise_diffusion_model->set_circular_pad(use_circular_pad);
+            }
+
             diffusion_model->alloc_params_buffer();
             diffusion_model->get_param_tensors(tensors);
 
@@ -535,6 +545,13 @@ class StableDiffusionGGML {
             }
             // first_stage_model->get_param_tensors(tensors, "first_stage_model.");
 
+            if (first_stage_model) {
+                first_stage_model->set_circular_pad(use_circular_pad);
+            }
+            if (tae_first_stage) {
+                tae_first_stage->set_circular_pad(use_circular_pad);
+            }
+
             if (strlen(SAFE_STR(sd_ctx_params->control_net_path)) > 0) {
                 ggml_backend_t controlnet_backend = NULL;
                 if (sd_ctx_params->keep_control_net_on_cpu && !ggml_backend_is_cpu(backend)) {
@@ -551,6 +568,7 @@ class StableDiffusionGGML {
                     LOG_INFO("Using Conv2d direct in the control net");
                     control_net->enable_conv2d_direct();
                 }
+                control_net->set_circular_pad(use_circular_pad);
             }
 
             if (strstr(SAFE_STR(sd_ctx_params->photo_maker_path), "v2")) {
@@ -568,6 +586,9 @@ class StableDiffusionGGML {
                                                                    "pmid",
                                                                    version);
             }
+            if (pmid_model) {
+                pmid_model->set_circular_pad(use_circular_pad);
+            }
             if (strlen(SAFE_STR(sd_ctx_params->photo_maker_path)) > 0) {
                 pmid_lora = std::make_shared<LoraModel>(backend, sd_ctx_params->photo_maker_path, "");
                 if (!pmid_lora->load_from_file(true, n_threads)) {
@@ -1820,6 +1841,7 @@ void sd_ctx_params_init(sd_ctx_params_t* sd_ctx_params) {
     sd_ctx_params->keep_control_net_on_cpu = false;
     sd_ctx_params->keep_vae_on_cpu         = false;
     sd_ctx_params->diffusion_flash_attn    = false;
+    sd_ctx_params->pad_circular            = false;
     sd_ctx_params->chroma_use_dit_mask     = true;
     sd_ctx_params->chroma_use_t5_mask      = false;
     sd_ctx_params->chroma_t5_mask_pad      = 1;
@@ -1860,6 +1882,7 @@ char* sd_ctx_params_to_str(const sd_ctx_params_t* sd_ctx_params) {
              "keep_control_net_on_cpu: %s\n"
              "keep_vae_on_cpu: %s\n"
              "diffusion_flash_attn: %s\n"
+             "pad_circular: %s\n"
              "chroma_use_dit_mask: %s\n"
              "chroma_use_t5_mask: %s\n"
              "chroma_t5_mask_pad: %d\n",
@@ -1889,6 +1912,7 @@ char* sd_ctx_params_to_str(const sd_ctx_params_t* sd_ctx_params) {
              BOOL_STR(sd_ctx_params->keep_control_net_on_cpu),
              BOOL_STR(sd_ctx_params->keep_vae_on_cpu),
              BOOL_STR(sd_ctx_params->diffusion_flash_attn),
+             BOOL_STR(sd_ctx_params->pad_circular),
              BOOL_STR(sd_ctx_params->chroma_use_dit_mask),
              BOOL_STR(sd_ctx_params->chroma_use_t5_mask),
              sd_ctx_params->chroma_t5_mask_pad);
diff --git a/stable-diffusion.h b/stable-diffusion.h
index a891a58f1..e521cfccc 100644
--- a/stable-diffusion.h
+++ b/stable-diffusion.h
@@ -164,6 +164,7 @@ typedef struct {
     bool diffusion_flash_attn;
     bool diffusion_conv_direct;
     bool vae_conv_direct;
+    bool pad_circular;  // circular-padding switch; sd_ctx_params_init() defaults it to false
     bool force_sdxl_vae_conv_scale;
     bool chroma_use_dit_mask;
     bool chroma_use_t5_mask;
diff --git a/t5.hpp b/t5.hpp
index 15f7af80b..97d81c448 100644
--- a/t5.hpp
+++ b/t5.hpp
@@ -611,7 +611,7 @@ class T5Attention : public GGMLBlock {
 
         k = ggml_scale_inplace(ctx, k, sqrt(d_head));
 
-        x = ggml_nn_attention_ext(ctx, backend, q, k, v, num_heads, mask);  // [N, n_token, d_head * n_head]
+        x = ggml_nn_attention_ext(ctx, backend, q, k, v, num_heads, mask, false, false, false, 1.0f, use_circular_pad());  // [N, n_token, d_head * n_head]
 
         x = out_proj->forward(ctx, x);  // [N, n_token, model_dim]
         return {x, past_bias};
@@ -1040,4 +1040,4 @@ struct T5Embedder {
     }
 };
 
-#endif  // __T5_HPP__
\ No newline at end of file
+#endif  // __T5_HPP__
diff --git a/tae.hpp b/tae.hpp
index 41bcbe2f1..2be1336a0 100644
--- a/tae.hpp
+++ b/tae.hpp
@@ -222,6 +222,10 @@ struct TinyAutoEncoder : public GGMLRunner {
         return "taesd";
     }
 
+    void set_circular_pad(bool enabled) override {  // forwards the flag to the taesd member
+        taesd.set_circular_pad(enabled);  // NOTE(review): GGMLRunner::set_circular_pad is not invoked here, whereas UNetModelRunner does invoke it - confirm intended
+    }
+
     bool load_from_file(const std::string& file_path, int n_threads) {
         LOG_INFO("loading taesd from '%s', decode_only = %s", file_path.c_str(), decode_only ? "true" : "false");
         alloc_params_buffer();
@@ -270,4 +274,4 @@ struct TinyAutoEncoder : public GGMLRunner {
     }
 };
 
-#endif  // __TAE_HPP__
\ No newline at end of file
+#endif  // __TAE_HPP__
diff --git a/unet.hpp b/unet.hpp
index 19bedb32b..ea9fab1ec 100644
--- a/unet.hpp
+++ b/unet.hpp
@@ -570,6 +570,11 @@ struct UNetModelRunner : public GGMLRunner {
         unet.get_param_tensors(tensors, prefix);
     }
 
+    void set_circular_pad(bool enabled) override {  // applies the flag to both the runner base and the unet member
+        GGMLRunner::set_circular_pad(enabled);  // base-class implementation runs first
+        unet.set_circular_pad(enabled);
+    }
+
     struct ggml_cgraph* build_graph(struct ggml_tensor* x,
                                     struct ggml_tensor* timesteps,
                                     struct ggml_tensor* context,
diff --git a/vae.hpp b/vae.hpp
index 20d97a2ad..5f486699d 100644
--- a/vae.hpp
+++ b/vae.hpp
@@ -124,13 +124,16 @@ class AE3DConv : public Conv2d {
              std::pair<int, int> padding  = {0, 0},
              std::pair<int, int> dilation = {1, 1},
              bool bias                    = true)
-        : Conv2d(in_channels, out_channels, kernel_size, stride, padding, dilation, bias) {
+        : Conv2d(in_channels, out_channels, kernel_size, stride, padding, dilation, bias, true) {
         int64_t kernel_padding  = video_kernel_size / 2;
         blocks["time_mix_conv"] = std::shared_ptr<GGMLBlock>(new Conv3dnx1x1(out_channels,
                                                                              out_channels,
                                                                              video_kernel_size,
                                                                              1,
-                                                                             kernel_padding));
+                                                                             kernel_padding,
+                                                                             1,
+                                                                             true,
+                                                                             true));
     }
 
     struct ggml_tensor* forward(struct ggml_context* ctx,
@@ -531,6 +534,7 @@ struct VAE : public GGMLRunner {
     virtual void get_param_tensors(std::map<std::string, struct ggml_tensor*>& tensors, const std::string prefix) = 0;
     virtual void enable_conv2d_direct(){};
     virtual void set_conv2d_scale(float scale) { SD_UNUSED(scale); };
+    void set_circular_pad(bool enabled) override { SD_UNUSED(enabled); }  // no-op default (the flag is discarded); subclasses such as AutoEncoderKL override this to forward it
 };
 
 struct AutoEncoderKL : public VAE {
@@ -570,6 +574,10 @@ struct AutoEncoderKL : public VAE {
         }
     }
 
+    void set_circular_pad(bool enabled) override {  // replaces the no-op default declared in VAE
+        ae.set_circular_pad(enabled);  // NOTE(review): only the ae member receives the flag; GGMLRunner::set_circular_pad is not called (UNetModelRunner does call it) - confirm intended
+    }
+
     std::string get_desc() {
         return "vae";
     }
diff --git a/wan.hpp b/wan.hpp
index 31fa90b3a..e153ed82d 100644
--- a/wan.hpp
+++ b/wan.hpp
@@ -73,7 +73,7 @@ namespace WAN {
                 lp2 -= (int)cache_x->ne[2];
             }
 
-            x = ggml_pad_ext(ctx, x, lp0, rp0, lp1, rp1, lp2, rp2, 0, 0);
+            x = sd_pad_ext(ctx, x, lp0, rp0, lp1, rp1, lp2, rp2, 0, 0, use_circular_pad());  // NOTE(review): one flag covers the lp2/rp2 pads as well as lp0..rp1 - confirm wrap-around is wanted on that axis too
             return ggml_nn_conv_3d(ctx, x, w, b, in_channels,
                                    std::get<2>(stride), std::get<1>(stride), std::get<0>(stride),
                                    0, 0, 0,
@@ -172,7 +172,7 @@ namespace WAN {
                                                   2);
                         }
                         if (chunk_idx == 1 && cache_x->ne[2] < 2) {  // Rep
-                            cache_x = ggml_pad_ext(ctx, cache_x, 0, 0, 0, 0, (int)cache_x->ne[2], 0, 0, 0);
+                            cache_x = sd_pad_ext(ctx, cache_x, 0, 0, 0, 0, (int)cache_x->ne[2], 0, 0, 0, false);  // zero-fill regardless of the circular-pad setting, matching the torch.cat(zeros_like(...)) note below
                             // aka cache_x = torch.cat([torch.zeros_like(cache_x).to(cache_x.device),cache_x],dim=2)
                         }
                         if (chunk_idx == 1) {
@@ -198,9 +198,9 @@ namespace WAN {
                 } else if (mode == "upsample3d") {
                     x = ggml_upscale(ctx, x, 2, GGML_SCALE_MODE_NEAREST);
                 } else if (mode == "downsample2d") {
-                    x = ggml_pad(ctx, x, 1, 1, 0, 0);
+                    x = sd_pad(ctx, x, 1, 1, 0, 0, use_circular_pad());
                 } else if (mode == "downsample3d") {
-                    x = ggml_pad(ctx, x, 1, 1, 0, 0);
+                    x = sd_pad(ctx, x, 1, 1, 0, 0, use_circular_pad());
                 }
                 x = resample_1->forward(ctx, x);
                 x = ggml_nn_cont(ctx, ggml_torch_permute(ctx, x, 0, 1, 3, 2));  // (c, t, h, w)
@@ -260,7 +260,7 @@ namespace WAN {
 
             int64_t pad_t = (factor_t - T % factor_t) % factor_t;
 
-            x = ggml_pad_ext(ctx, x, 0, 0, 0, 0, pad_t, 0, 0, 0);
+            x = sd_pad_ext(ctx, x, 0, 0, 0, 0, pad_t, 0, 0, 0, use_circular_pad());  // NOTE(review): pad_t only rounds T up to a multiple of factor_t; confirm a circular fill is wanted here instead of the previous zero fill
             T = x->ne[2];
 
             x = ggml_reshape_4d(ctx, x, W * H, factor_t, T / factor_t, C);                                                  // [C, T/factor_t, factor_t, H*W]
@@ -1120,6 +1120,10 @@ namespace WAN {
             return "wan_vae";
         }
 
+        void set_circular_pad(bool enabled) override {  // forwards the flag to the ae member
+            ae.set_circular_pad(enabled);
+        }
+
         void get_param_tensors(std::map<std::string, struct ggml_tensor*>& tensors, const std::string prefix) {
             ae.get_param_tensors(tensors, prefix);
         }
@@ -1333,7 +1337,7 @@ namespace WAN {
             k = ggml_reshape_4d(ctx, k, head_dim, num_heads, n_token, N);  // [N, n_token, n_head, d_head]
             v = ggml_reshape_4d(ctx, v, head_dim, num_heads, n_token, N);  // [N, n_token, n_head, d_head]
 
-            x = Rope::attention(ctx, backend, q, k, v, pe, mask, flash_attn);  // [N, n_token, dim]
+            x = Rope::attention(ctx, backend, q, k, v, pe, mask, flash_attn, 1.0f, true, use_circular_pad());  // [N, n_token, dim]
 
             x = o_proj->forward(ctx, x);  // [N, n_token, dim]
             return x;
@@ -1388,7 +1392,7 @@ namespace WAN {
             k      = norm_k->forward(ctx, k);
             auto v = v_proj->forward(ctx, context);  // [N, n_context, dim]
 
-            x = ggml_nn_attention_ext(ctx, backend, q, k, v, num_heads, NULL, false, false, flash_attn);  // [N, n_token, dim]
+            x = ggml_nn_attention_ext(ctx, backend, q, k, v, num_heads, NULL, false, false, flash_attn, 1.0f, use_circular_pad());  // [N, n_token, dim]
 
             x = o_proj->forward(ctx, x);  // [N, n_token, dim]
             return x;
@@ -1455,8 +1459,8 @@ namespace WAN {
             k_img      = norm_k_img->forward(ctx, k_img);
             auto v_img = v_img_proj->forward(ctx, context_img);  // [N, context_img_len, dim]
 
-            auto img_x = ggml_nn_attention_ext(ctx, backend, q, k_img, v_img, num_heads, NULL, false, false, flash_attn);  // [N, n_token, dim]
-            x          = ggml_nn_attention_ext(ctx, backend, q, k, v, num_heads, NULL, false, false, flash_attn);          // [N, n_token, dim]
+            auto img_x = ggml_nn_attention_ext(ctx, backend, q, k_img, v_img, num_heads, NULL, false, false, flash_attn, 1.0f, use_circular_pad());  // [N, n_token, dim]
+            x          = ggml_nn_attention_ext(ctx, backend, q, k, v, num_heads, NULL, false, false, flash_attn, 1.0f, use_circular_pad());          // [N, n_token, dim]
 
             x = ggml_add(ctx, x, img_x);
 
@@ -1838,7 +1842,7 @@ namespace WAN {
             int pad_t = (std::get<0>(params.patch_size) - T % std::get<0>(params.patch_size)) % std::get<0>(params.patch_size);
             int pad_h = (std::get<1>(params.patch_size) - H % std::get<1>(params.patch_size)) % std::get<1>(params.patch_size);
             int pad_w = (std::get<2>(params.patch_size) - W % std::get<2>(params.patch_size)) % std::get<2>(params.patch_size);
-            x         = ggml_pad(ctx, x, pad_w, pad_h, pad_t, 0);  // [N*C, T + pad_t, H + pad_h, W + pad_w]
+            x         = sd_pad(ctx, x, pad_w, pad_h, pad_t, 0, use_circular_pad());  // [N*C, T + pad_t, H + pad_h, W + pad_w]; NOTE(review): pad_t shares the circular flag with pad_w/pad_h - confirm
 
             return x;
         }
@@ -2142,6 +2146,11 @@ namespace WAN {
             wan.get_param_tensors(tensors, prefix);
         }
 
+        void set_circular_pad(bool enabled) override {  // applies the flag to both the runner base and the wan member
+            GGMLRunner::set_circular_pad(enabled);  // base-class implementation runs first
+            wan.set_circular_pad(enabled);
+        }
+
         struct ggml_cgraph* build_graph(struct ggml_tensor* x,
                                         struct ggml_tensor* timesteps,
                                         struct ggml_tensor* context,