diff --git a/.clang-tidy b/.clang-tidy
new file mode 100644
index 000000000..63924a0ba
--- /dev/null
+++ b/.clang-tidy
@@ -0,0 +1,10 @@
+Checks: >
+ modernize-make-shared,
+ modernize-use-nullptr,
+ modernize-use-override,
+ modernize-pass-by-value,
+ modernize-return-braced-init-list,
+ modernize-deprecated-headers,
+HeaderFilterRegex: '^$'
+WarningsAsErrors: ''
+FormatStyle: none
\ No newline at end of file
diff --git a/.github/ISSUE_TEMPLATE/bug_report.yml b/.github/ISSUE_TEMPLATE/bug_report.yml
new file mode 100644
index 000000000..5affda3bc
--- /dev/null
+++ b/.github/ISSUE_TEMPLATE/bug_report.yml
@@ -0,0 +1,73 @@
+name: 🐞 Bug Report
+description: Report a bug or unexpected behavior
+title: "[Bug] "
+labels: ["bug"]
+body:
+ - type: markdown
+ attributes:
+ value: |
+ Please use this template and include as many details as possible to help us reproduce and fix the issue.
+ - type: textarea
+ id: commit
+ attributes:
+ label: Git commit
+ description: Which commit are you trying to compile?
+ placeholder: |
+ $git rev-parse HEAD
+ 40a6a8710ec15b1b5db6b5a098409f6bc8f654a4
+ validations:
+ required: true
+ - type: input
+ id: os
+ attributes:
+ label: Operating System & Version
+ placeholder: e.g. “Ubuntu 22.04”, “Windows 11 23H2”, “macOS 14.3”
+ validations:
+ required: true
+ - type: dropdown
+ id: backends
+ attributes:
+ label: GGML backends
+ description: Which GGML backends do you know to be affected?
+ options: [CPU, CUDA, HIP, Metal, Musa, SYCL, Vulkan, OpenCL]
+ multiple: true
+ validations:
+ required: true
+ - type: input
+ id: cmd_arguments
+ attributes:
+ label: Command-line arguments used
+ placeholder: The full command line you ran (with all flags)
+ validations:
+ required: true
+ - type: textarea
+ id: steps_to_reproduce
+ attributes:
+ label: Steps to reproduce
+ placeholder: A step-by-step list of what you did
+ validations:
+ required: true
+ - type: textarea
+ id: expected_behavior
+ attributes:
+ label: What you expected to happen
+ placeholder: Describe the expected behavior or result
+ validations:
+ required: true
+ - type: textarea
+ id: actual_behavior
+ attributes:
+ label: What actually happened
+ placeholder: Describe what you saw instead (errors, logs, crash, etc.)
+ validations:
+ required: true
+ - type: textarea
+ id: logs_and_errors
+ attributes:
+ label: Logs / error messages / stack trace
+ placeholder: Paste complete logs or error output
+ - type: textarea
+ id: additional_info
+ attributes:
+ label: Additional context / environment details
+ placeholder: e.g. CPU model, GPU, RAM, model file versions, quantization type, etc.
diff --git a/.github/ISSUE_TEMPLATE/feature_request.yml b/.github/ISSUE_TEMPLATE/feature_request.yml
new file mode 100644
index 000000000..243faca4c
--- /dev/null
+++ b/.github/ISSUE_TEMPLATE/feature_request.yml
@@ -0,0 +1,33 @@
+name: 💡 Feature Request
+description: Suggest a new feature or improvement
+title: "[Feature] "
+labels: ["enhancement"]
+body:
+ - type: markdown
+ attributes:
+ value: |
+ Thank you for suggesting an improvement! Please fill in the fields below.
+ - type: input
+ id: summary
+ attributes:
+ label: Feature Summary
+ placeholder: A one-line summary of the feature you’d like
+ validations:
+ required: true
+ - type: textarea
+ id: description
+ attributes:
+ label: Detailed Description
+ placeholder: What problem does this solve? How do you expect it to work?
+ validations:
+ required: true
+ - type: textarea
+ id: alternatives
+ attributes:
+ label: Alternatives you considered
+ placeholder: Any alternative designs or workarounds you tried
+ - type: textarea
+ id: additional_context
+ attributes:
+ label: Additional context
+ placeholder: Any extra information (use cases, related functionalities, constraints)
diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml
index f6cf11ed0..8dd7ce19c 100644
--- a/.github/workflows/build.yml
+++ b/.github/workflows/build.yml
@@ -65,7 +65,7 @@ jobs:
- name: Get commit hash
id: commit
- if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/main' ) || github.event.inputs.create_release == 'true' }}
+ if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }}
uses: pr-mpt/actions-commit-hash@v2
- name: Fetch system info
@@ -118,7 +118,7 @@ jobs:
- name: Get commit hash
id: commit
- if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/main' ) || github.event.inputs.create_release == 'true' }}
+ if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }}
uses: pr-mpt/actions-commit-hash@v2
- name: Fetch system info
@@ -149,7 +149,7 @@ jobs:
runs-on: windows-2025
env:
- VULKAN_VERSION: 1.3.261.1
+ VULKAN_VERSION: 1.4.328.1
strategy:
matrix:
@@ -164,8 +164,6 @@ jobs:
defines: "-DGGML_NATIVE=OFF -DGGML_AVX512=ON -DGGML_AVX=ON -DGGML_AVX2=ON -DSD_BUILD_SHARED_LIBS=ON"
- build: "cuda12"
defines: "-DSD_CUDA=ON -DSD_BUILD_SHARED_LIBS=ON -DCMAKE_CUDA_ARCHITECTURES=90;89;86;80;75"
- # - build: "rocm5.5"
- # defines: '-G Ninja -DCMAKE_C_COMPILER=clang -DCMAKE_CXX_COMPILER=clang++ -DSD_HIPBLAS=ON -DCMAKE_BUILD_TYPE=Release -DAMDGPU_TARGETS="gfx1100;gfx1102;gfx1030" -DSD_BUILD_SHARED_LIBS=ON'
- build: 'vulkan'
defines: "-DSD_VULKAN=ON -DSD_BUILD_SHARED_LIBS=ON"
steps:
@@ -184,24 +182,11 @@ jobs:
method: "network"
sub-packages: '["nvcc", "cudart", "cublas", "cublas_dev", "thrust", "visual_studio_integration"]'
- - name: Install rocm-toolkit
- id: rocm-toolkit
- if: ${{ matrix.build == 'rocm5.5' }}
- uses: Cyberhan123/rocm-toolkit@v0.1.0
- with:
- rocm: "5.5.0"
-
- - name: Install Ninja
- id: install-ninja
- if: ${{ matrix.build == 'rocm5.5' }}
- uses: urkle/action-get-ninja@v1
- with:
- version: 1.11.1
- name: Install Vulkan SDK
id: get_vulkan
if: ${{ matrix.build == 'vulkan' }}
run: |
- curl.exe -o $env:RUNNER_TEMP/VulkanSDK-Installer.exe -L "https://sdk.lunarg.com/sdk/download/$%7Benv:VULKAN_VERSION%7D/windows/VulkanSDK-$%7Benv:VULKAN_VERSION%7D-Installer.exe"
+ curl.exe -o $env:RUNNER_TEMP/VulkanSDK-Installer.exe -L "https://sdk.lunarg.com/sdk/download/$%7Benv:VULKAN_VERSION%7D/windows/vulkansdk-windows-X64-$%7Benv:VULKAN_VERSION%7D.exe"
& "$env:RUNNER_TEMP\VulkanSDK-Installer.exe" --accept-licenses --default-answer --confirm-command install
Add-Content $env:GITHUB_ENV "VULKAN_SDK=C:\VulkanSDK\${env:VULKAN_VERSION}"
Add-Content $env:GITHUB_PATH "C:\VulkanSDK\${env:VULKAN_VERSION}\bin"
@@ -277,6 +262,104 @@ jobs:
path: |
sd-${{ env.BRANCH_NAME }}-${{ steps.commit.outputs.short }}-bin-win-${{ matrix.build }}-x64.zip
+ windows-latest-cmake-hip:
+ runs-on: windows-2022
+
+ env:
+ HIPSDK_INSTALLER_VERSION: "25.Q3"
+ GPU_TARGETS: "gfx1151;gfx1200;gfx1201;gfx1100;gfx1101;gfx1102;gfx1030;gfx1031;gfx1032"
+
+ steps:
+ - uses: actions/checkout@v3
+ with:
+ submodules: recursive
+
+ - name: Cache ROCm Installation
+ id: cache-rocm
+ uses: actions/cache@v4
+ with:
+ path: C:\Program Files\AMD\ROCm
+ key: rocm-${{ env.HIPSDK_INSTALLER_VERSION }}-${{ runner.os }}
+
+ - name: ccache
+ uses: ggml-org/ccache-action@v1.2.16
+ with:
+ key: windows-latest-cmake-hip-${{ env.HIPSDK_INSTALLER_VERSION }}-x64
+ evict-old-files: 1d
+
+ - name: Install ROCm
+ if: steps.cache-rocm.outputs.cache-hit != 'true'
+ run: |
+ $ErrorActionPreference = "Stop"
+ write-host "Downloading AMD HIP SDK Installer"
+ Invoke-WebRequest -Uri "https://download.amd.com/developer/eula/rocm-hub/AMD-Software-PRO-Edition-$%7B%7B%20env.HIPSDK_INSTALLER_VERSION%20%7D%7D-WinSvr2022-For-HIP.exe" -OutFile "${env:RUNNER_TEMP}\rocm-install.exe"
+ write-host "Installing AMD HIP SDK"
+ $proc = Start-Process "${env:RUNNER_TEMP}\rocm-install.exe" -ArgumentList '-install' -NoNewWindow -PassThru
+ $completed = $proc.WaitForExit(600000)
+ if (-not $completed) {
+ Write-Error "ROCm installation timed out after 10 minutes. Killing the process"
+ $proc.Kill()
+ exit 1
+ }
+ if ($proc.ExitCode -ne 0) {
+ Write-Error "ROCm installation failed with exit code $($proc.ExitCode)"
+ exit 1
+ }
+ write-host "Completed AMD HIP SDK installation"
+
+ - name: Verify ROCm
+ run: |
+ # Find and test ROCm installation
+ $clangPath = Get-ChildItem 'C:\Program Files\AMD\ROCm\*\bin\clang.exe' | Select-Object -First 1
+ if (-not $clangPath) {
+ Write-Error "ROCm installation not found"
+ exit 1
+ }
+ & $clangPath.FullName --version
+ # Set HIP_PATH environment variable for later steps
+ echo "HIP_PATH=$(Resolve-Path 'C:\Program Files\AMD\ROCm\*\bin\clang.exe' | split-path | split-path)" >> $env:GITHUB_ENV
+
+ - name: Build
+ run: |
+ mkdir build
+ cd build
+ $env:CMAKE_PREFIX_PATH="${env:HIP_PATH}"
+ cmake .. `
+ -G "Unix Makefiles" `
+ -DSD_HIPBLAS=ON `
+ -DSD_BUILD_SHARED_LIBS=ON `
+ -DGGML_NATIVE=OFF `
+ -DCMAKE_C_COMPILER=clang `
+ -DCMAKE_CXX_COMPILER=clang++ `
+ -DCMAKE_BUILD_TYPE=Release `
+ -DGPU_TARGETS="${{ env.GPU_TARGETS }}"
+ cmake --build . --config Release --parallel ${env:NUMBER_OF_PROCESSORS}
+
+ - name: Get commit hash
+ id: commit
+ if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }}
+ uses: pr-mpt/actions-commit-hash@v2
+
+ - name: Pack artifacts
+ if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }}
+ run: |
+ md "build\bin\rocblas\library\"
+ md "build\bin\hipblaslt\library"
+ cp "${env:HIP_PATH}\bin\hipblas.dll" "build\bin\"
+ cp "${env:HIP_PATH}\bin\hipblaslt.dll" "build\bin\"
+ cp "${env:HIP_PATH}\bin\rocblas.dll" "build\bin\"
+ cp "${env:HIP_PATH}\bin\rocblas\library\*" "build\bin\rocblas\library\"
+ cp "${env:HIP_PATH}\bin\hipblaslt\library\*" "build\bin\hipblaslt\library\"
+ 7z a sd-${{ env.BRANCH_NAME }}-${{ steps.commit.outputs.short }}-bin-win-rocm-x64.zip .\build\bin\*
+
+ - name: Upload artifacts
+ if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }}
+ uses: actions/upload-artifact@v4
+ with:
+ name: sd-${{ env.BRANCH_NAME }}-${{ steps.commit.outputs.short }}-bin-win-rocm-x64.zip
+ path: |
+ sd-${{ env.BRANCH_NAME }}-${{ steps.commit.outputs.short }}-bin-win-rocm-x64.zip
+
release:
if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }}
@@ -286,6 +369,7 @@ jobs:
- ubuntu-latest-cmake
- macOS-latest-cmake
- windows-latest-cmake
+ - windows-latest-cmake-hip
steps:
- name: Clone
diff --git a/.gitignore b/.gitignore
index dd4f6435a..b0e3af83f 100644
--- a/.gitignore
+++ b/.gitignore
@@ -12,3 +12,4 @@ test/
output*.png
models*
*.log
+preview.png
diff --git a/CMakeLists.txt b/CMakeLists.txt
index c0735e5b1..7dc36f0fd 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -33,6 +33,7 @@ option(SD_SYCL "sd: sycl backend" OFF)
option(SD_MUSA "sd: musa backend" OFF)
option(SD_FAST_SOFTMAX "sd: x1.5 faster softmax, indeterministic (sometimes, same seed don't generate same image), cuda only" OFF)
option(SD_BUILD_SHARED_LIBS "sd: build shared libs" OFF)
+option(SD_BUILD_SHARED_GGML_LIB "sd: build ggml as a separate shared lib" OFF)
option(SD_USE_SYSTEM_GGML "sd: use system-installed GGML library" OFF)
#option(SD_BUILD_SERVER "sd: build server example" ON)
@@ -86,18 +87,21 @@ file(GLOB SD_LIB_SOURCES
"*.hpp"
)
-# we can get only one share lib
if(SD_BUILD_SHARED_LIBS)
message("-- Build shared library")
message(${SD_LIB_SOURCES})
- set(BUILD_SHARED_LIBS OFF)
+ if(NOT SD_BUILD_SHARED_GGML_LIB)
+ set(BUILD_SHARED_LIBS OFF)
+ endif()
add_library(${SD_LIB} SHARED ${SD_LIB_SOURCES})
add_definitions(-DSD_BUILD_SHARED_LIB)
target_compile_definitions(${SD_LIB} PRIVATE -DSD_BUILD_DLL)
set(CMAKE_POSITION_INDEPENDENT_CODE ON)
else()
message("-- Build static library")
- set(BUILD_SHARED_LIBS OFF)
+ if(NOT SD_BUILD_SHARED_GGML_LIB)
+ set(BUILD_SHARED_LIBS OFF)
+ endif()
add_library(${SD_LIB} STATIC ${SD_LIB_SOURCES})
endif()
diff --git a/Dockerfile b/Dockerfile
index bd9a378f0..417335793 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -1,16 +1,21 @@
ARG UBUNTU_VERSION=22.04
-FROM ubuntu:$UBUNTU_VERSION as build
+FROM ubuntu:$UBUNTU_VERSION AS build
-RUN apt-get update && apt-get install -y build-essential git cmake
+RUN apt-get update && apt-get install -y --no-install-recommends build-essential git cmake
WORKDIR /sd.cpp
COPY . .
-RUN mkdir build && cd build && cmake .. && cmake --build . --config Release
+RUN cmake . -B ./build
+RUN cmake --build ./build --config Release --parallel
-FROM ubuntu:$UBUNTU_VERSION as runtime
+FROM ubuntu:$UBUNTU_VERSION AS runtime
+
+RUN apt-get update && \
+ apt-get install --yes --no-install-recommends libgomp1 && \
+ apt-get clean
COPY --from=build /sd.cpp/build/bin/sd /sd
diff --git a/README.md b/README.md
index c5c3eb1a3..5cc6e4458 100644
--- a/README.md
+++ b/README.md
@@ -4,25 +4,46 @@
# stable-diffusion.cpp
+
+

+
+
Diffusion model(SD,Flux,Wan,...) inference in pure C/C++
***Note that this project is under active development. \
API and command-line option may change frequently.***
+## 🔥Important News
+
+* **2025/10/13** 🚀 stable-diffusion.cpp now supports **Qwen-Image-Edit / Qwen-Image-Edit 2509**
+ 👉 Details: [PR #877](https://github.com/leejet/stable-diffusion.cpp/pull/877)
+
+* **2025/10/12** 🚀 stable-diffusion.cpp now supports **Qwen-Image**
+ 👉 Details: [PR #851](https://github.com/leejet/stable-diffusion.cpp/pull/851)
+
+* **2025/09/14** 🚀 stable-diffusion.cpp now supports **Wan2.1 Vace**
+ 👉 Details: [PR #819](https://github.com/leejet/stable-diffusion.cpp/pull/819)
+
+* **2025/09/06** 🚀 stable-diffusion.cpp now supports **Wan2.1 / Wan2.2**
+ 👉 Details: [PR #778](https://github.com/leejet/stable-diffusion.cpp/pull/778)
+
## Features
-- Plain C/C++ implementation based on [ggml](https://github.com/ggerganov/ggml), working in the same way as [llama.cpp](https://github.com/ggerganov/llama.cpp)
+- Plain C/C++ implementation based on [ggml](https://github.com/ggml-org/ggml), working in the same way as [llama.cpp](https://github.com/ggml-org/llama.cpp)
- Super lightweight and without external dependencies
- Supported models
- Image Models
- SD1.x, SD2.x, [SD-Turbo](https://huggingface.co/stabilityai/sd-turbo)
- SDXL, [SDXL-Turbo](https://huggingface.co/stabilityai/sdxl-turbo)
- - !!!The VAE in SDXL encounters NaN issues under FP16, but unfortunately, the ggml_conv_2d only operates under FP16. Hence, a parameter is needed to specify the VAE that has fixed the FP16 NaN issue. You can find it here: [SDXL VAE FP16 Fix](https://huggingface.co/madebyollin/sdxl-vae-fp16-fix/blob/main/sdxl_vae.safetensors).
+ - [Some SD1.x and SDXL distilled models](./docs/distilled_sd.md)
- [SD3/SD3.5](./docs/sd3.md)
- [Flux-dev/Flux-schnell](./docs/flux.md)
- [Chroma](./docs/chroma.md)
+ - [Chroma1-Radiance](./docs/chroma_radiance.md)
+ - [Qwen Image](./docs/qwen_image.md)
- Image Edit Models
- [FLUX.1-Kontext-dev](./docs/kontext.md)
+ - [Qwen Image Edit/Qwen Image Edit 2509](./docs/qwen_image_edit.md)
- Video Models
- [Wan2.1/Wan2.2](./docs/wan.md)
- [PhotoMaker](https://github.com/TencentARC/PhotoMaker) support.
@@ -31,14 +52,22 @@ API and command-line option may change frequently.***
- Latent Consistency Models support (LCM/LCM-LoRA)
- Faster and memory efficient latent decoding with [TAESD](https://github.com/madebyollin/taesd)
- Upscale images generated with [ESRGAN](https://github.com/xinntao/Real-ESRGAN)
-- 16-bit, 32-bit float support
-- 2-bit, 3-bit, 4-bit, 5-bit and 8-bit integer quantization support
-- Accelerated memory-efficient CPU inference
- - Only requires ~2.3GB when using txt2img with fp16 precision to generate a 512x512 image, enabling Flash Attention just requires ~1.8GB.
-- AVX, AVX2 and AVX512 support for x86 architectures
-- Full CUDA, Metal, Vulkan, OpenCL and SYCL backend for GPU acceleration.
-- Can load ckpt, safetensors and diffusers models/checkpoints. Standalone VAEs models
- - No need to convert to `.ggml` or `.gguf` anymore!
+- Supported backends
+ - CPU (AVX, AVX2 and AVX512 support for x86 architectures)
+ - CUDA
+ - Vulkan
+ - Metal
+ - OpenCL
+ - SYCL
+- Supported weight formats
+ - Pytorch checkpoint (`.ckpt` or `.pth`)
+ - Safetensors (`.safetensors`)
+ - GGUF (`.gguf`)
+- Supported platforms
+ - Linux
+ - Mac OS
+ - Windows
+ - Android (via Termux, [Local Diffusion](https://github.com/rmatif/Local-Diffusion))
- Flash Attention for memory usage optimization
- Negative prompt
- [stable-diffusion-webui](https://github.com/AUTOMATIC1111/stable-diffusion-webui) style tokenizer (not all the features, only token weighting for now)
@@ -54,372 +83,45 @@ API and command-line option may change frequently.***
- [`LCM`](https://github.com/AUTOMATIC1111/stable-diffusion-webui/issues/13952)
- Cross-platform reproducibility (`--rng cuda`, consistent with the `stable-diffusion-webui GPU RNG`)
- Embedds generation parameters into png output as webui-compatible text string
-- Supported platforms
- - Linux
- - Mac OS
- - Windows
- - Android (via Termux, [Local Diffusion](https://github.com/rmatif/Local-Diffusion))
-## Usage
+## Quick Start
-For most users, you can download the built executable program from the latest [release](https://github.com/leejet/stable-diffusion.cpp/releases/latest).
-If the built product does not meet your requirements, you can choose to build it manually.
+### Get the sd executable
-### Get the Code
+- Download pre-built binaries from the [releases page](https://github.com/leejet/stable-diffusion.cpp/releases)
+- Or build from source by following the [build guide](./docs/build.md)
-```
-git clone --recursive https://github.com/leejet/stable-diffusion.cpp
-cd stable-diffusion.cpp
-```
+### Download model weights
-- If you have already cloned the repository, you can use the following command to update the repository to the latest code.
-
-```
-cd stable-diffusion.cpp
-git pull origin master
-git submodule init
-git submodule update
-```
-
-### Download weights
-
-- download original weights(.ckpt or .safetensors). For example
- - Stable Diffusion v1.4 from https://huggingface.co/CompVis/stable-diffusion-v-1-4-original
+- download weights (.ckpt, .safetensors, or .gguf). For example
- Stable Diffusion v1.5 from https://huggingface.co/runwayml/stable-diffusion-v1-5
- - Stable Diffuison v2.1 from https://huggingface.co/stabilityai/stable-diffusion-2-1
- - Stable Diffusion 3 2B from https://huggingface.co/stabilityai/stable-diffusion-3-medium
-
- ```shell
- curl -L -O https://huggingface.co/CompVis/stable-diffusion-v-1-4-original/resolve/main/sd-v1-4.ckpt
- # curl -L -O https://huggingface.co/runwayml/stable-diffusion-v1-5/resolve/main/v1-5-pruned-emaonly.safetensors
- # curl -L -O https://huggingface.co/stabilityai/stable-diffusion-2-1/resolve/main/v2-1_768-nonema-pruned.safetensors
- # curl -L -O https://huggingface.co/stabilityai/stable-diffusion-3-medium/resolve/main/sd3_medium_incl_clips_t5xxlfp16.safetensors
- ```
-
-### Build
-
-#### Build from scratch
-
-```shell
-mkdir build
-cd build
-cmake ..
-cmake --build . --config Release
-```
-
-##### Using OpenBLAS
-
-```
-cmake .. -DGGML_OPENBLAS=ON
-cmake --build . --config Release
-```
-
-##### Using CUDA
-
-This provides BLAS acceleration using the CUDA cores of your Nvidia GPU. Make sure to have the CUDA toolkit installed. You can download it from your Linux distro's package manager (e.g. `apt install nvidia-cuda-toolkit`) or from here: [CUDA Toolkit](https://developer.nvidia.com/cuda-downloads). Recommended to have at least 4 GB of VRAM.
-
-```
-cmake .. -DSD_CUDA=ON
-cmake --build . --config Release
-```
-
-##### Using HipBLAS
-This provides BLAS acceleration using the ROCm cores of your AMD GPU. Make sure to have the ROCm toolkit installed.
-To build for another GPU architecture than installed in your system, set `$GFX_NAME` manually to the desired architecture (replace first command). This is also necessary if your GPU is not officially supported by ROCm, for example you have to set `$GFX_NAME` manually to `gfx1030` for consumer RDNA2 cards.
-
-Windows User Refer to [docs/hipBLAS_on_Windows.md](docs%2FhipBLAS_on_Windows.md) for a comprehensive guide.
-
-```
-if command -v rocminfo; then export GFX_NAME=$(rocminfo | awk '/ *Name: +gfx[1-9]/ {print $2; exit}'); else echo "rocminfo missing!"; fi
-if [ -z "${GFX_NAME}" ]; then echo "Error: Couldn't detect GPU!"; else echo "Building for GPU: ${GFX_NAME}"; fi
-cmake .. -G "Ninja" -DCMAKE_C_COMPILER=clang -DCMAKE_CXX_COMPILER=clang++ -DSD_HIPBLAS=ON -DCMAKE_BUILD_TYPE=Release -DGPU_TARGETS=$GFX_NAME -DAMDGPU_TARGETS=$GFX_NAME -DCMAKE_BUILD_WITH_INSTALL_RPATH=ON -DCMAKE_POSITION_INDEPENDENT_CODE=ON
-cmake --build . --config Release
-```
-
-##### Using MUSA
-
-This provides BLAS acceleration using the MUSA cores of your Moore Threads GPU. Make sure to have the MUSA toolkit installed.
-
-```bash
-cmake .. -DCMAKE_C_COMPILER=/usr/local/musa/bin/clang -DCMAKE_CXX_COMPILER=/usr/local/musa/bin/clang++ -DSD_MUSA=ON -DCMAKE_BUILD_TYPE=Release
-cmake --build . --config Release
-```
-
-##### Using Metal
-
-Using Metal makes the computation run on the GPU. Currently, there are some issues with Metal when performing operations on very large matrices, making it highly inefficient at the moment. Performance improvements are expected in the near future.
-
-```
-cmake .. -DSD_METAL=ON
-cmake --build . --config Release
-```
-
-##### Using Vulkan
-
-Install Vulkan SDK from https://www.lunarg.com/vulkan-sdk/.
-```
-cmake .. -DSD_VULKAN=ON
-cmake --build . --config Release
-```
-
-##### Using OpenCL (for Adreno GPU)
-
-Currently, it supports only Adreno GPUs and is primarily optimized for Q4_0 type
-
-To build for Windows ARM please refers to [Windows 11 Arm64
-](https://github.com/ggml-org/llama.cpp/blob/master/docs/backend/OPENCL.md#windows-11-arm64)
-
-Building for Android:
-
- Android NDK:
- Download and install the Android NDK from the [official Android developer site](https://developer.android.com/ndk/downloads).
-
-Setup OpenCL Dependencies for NDK:
-
-You need to provide OpenCL headers and the ICD loader library to your NDK sysroot.
-
-* OpenCL Headers:
- ```bash
- # In a temporary working directory
- git clone https://github.com/KhronosGroup/OpenCL-Headers
- cd OpenCL-Headers
- # Replace with your actual NDK installation path
- # e.g., cp -r CL /path/to/android-ndk-r26c/toolchains/llvm/prebuilt/linux-x86_64/sysroot/usr/include
- sudo cp -r CL /toolchains/llvm/prebuilt/linux-x86_64/sysroot/usr/include
- cd ..
+ ```sh
+ curl -L -O https://huggingface.co/runwayml/stable-diffusion-v1-5/resolve/main/v1-5-pruned-emaonly.safetensors
```
-* OpenCL ICD Loader:
- ```bash
- # In the same temporary working directory
- git clone https://github.com/KhronosGroup/OpenCL-ICD-Loader
- cd OpenCL-ICD-Loader
- mkdir build_ndk && cd build_ndk
-
- # Replace in the CMAKE_TOOLCHAIN_FILE and OPENCL_ICD_LOADER_HEADERS_DIR
- cmake .. -G Ninja -DCMAKE_BUILD_TYPE=Release \
- -DCMAKE_TOOLCHAIN_FILE=/build/cmake/android.toolchain.cmake \
- -DOPENCL_ICD_LOADER_HEADERS_DIR=/toolchains/llvm/prebuilt/linux-x86_64/sysroot/usr/include \
- -DANDROID_ABI=arm64-v8a \
- -DANDROID_PLATFORM=24 \
- -DANDROID_STL=c++_shared
-
- ninja
- # Replace
- # e.g., cp libOpenCL.so /path/to/android-ndk-r26c/toolchains/llvm/prebuilt/linux-x86_64/sysroot/usr/lib/aarch64-linux-android
- sudo cp libOpenCL.so /toolchains/llvm/prebuilt/linux-x86_64/sysroot/usr/lib/aarch64-linux-android
- cd ../..
- ```
-
-Build `stable-diffusion.cpp` for Android with OpenCL:
-
-```bash
-mkdir build-android && cd build-android
-
-# Replace with your actual NDK installation path
-# e.g., -DCMAKE_TOOLCHAIN_FILE=/path/to/android-ndk-r26c/build/cmake/android.toolchain.cmake
-cmake .. -G Ninja \
- -DCMAKE_TOOLCHAIN_FILE=/build/cmake/android.toolchain.cmake \
- -DANDROID_ABI=arm64-v8a \
- -DANDROID_PLATFORM=android-28 \
- -DGGML_OPENMP=OFF \
- -DSD_OPENCL=ON
-
-ninja
-```
-*(Note: Don't forget to include `LD_LIBRARY_PATH=/vendor/lib64` in your command line before running the binary)*
-
-##### Using SYCL
-
-Using SYCL makes the computation run on the Intel GPU. Please make sure you have installed the related driver and [Intel® oneAPI Base toolkit](https://www.intel.com/content/www/us/en/developer/tools/oneapi/base-toolkit.html) before start. More details and steps can refer to [llama.cpp SYCL backend](https://github.com/ggerganov/llama.cpp/blob/master/docs/backend/SYCL.md#linux).
-
-```
-# Export relevant ENV variables
-source /opt/intel/oneapi/setvars.sh
-
-# Option 1: Use FP32 (recommended for better performance in most cases)
-cmake .. -DSD_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx
-
-# Option 2: Use FP16
-cmake .. -DSD_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx -DGGML_SYCL_F16=ON
-
-cmake --build . --config Release
-```
-
-Example of text2img by using SYCL backend:
-
-- download `stable-diffusion` model weight, refer to [download-weight](#download-weights).
-
-- run `./bin/sd -m ../models/sd3_medium_incl_clips_t5xxlfp16.safetensors --cfg-scale 5 --steps 30 --sampling-method euler -H 1024 -W 1024 --seed 42 -p "fantasy medieval village world inside a glass sphere , high detail, fantasy, realistic, light effect, hyper detail, volumetric lighting, cinematic, macro, depth of field, blur, red light and clouds from the back, highly detailed epic cinematic concept art cg render made in maya, blender and photoshop, octane render, excellent composition, dynamic dramatic cinematic lighting, aesthetic, very inspirational, world inside a glass sphere by james gurney by artgerm with james jean, joe fenton and tristan eaton by ross tran, fine details, 4k resolution"`
-
-
-
-
-
-
-
-##### Using Flash Attention
-
-Enabling flash attention for the diffusion model reduces memory usage by varying amounts of MB.
-eg.:
- - flux 768x768 ~600mb
- - SD2 768x768 ~1400mb
-
-For most backends, it slows things down, but for cuda it generally speeds it up too.
-At the moment, it is only supported for some models and some backends (like cpu, cuda/rocm, metal).
-
-Run by adding `--diffusion-fa` to the arguments and watch for:
-```
-[INFO ] stable-diffusion.cpp:312 - Using flash attention in the diffusion model
-```
-and the compute buffer shrink in the debug log:
-```
-[DEBUG] ggml_extend.hpp:1004 - flux compute buffer size: 650.00 MB(VRAM)
-```
-
-### Run
-
-```
-usage: ./bin/sd [arguments]
-
-arguments:
- -h, --help show this help message and exit
- -M, --mode [MODE] run mode, one of: [img_gen, vid_gen, convert], default: img_gen
- -t, --threads N number of threads to use during computation (default: -1)
- If threads <= 0, then threads will be set to the number of CPU physical cores
- --offload-to-cpu place the weights in RAM to save VRAM, and automatically load them into VRAM when needed
- -m, --model [MODEL] path to full model
- --diffusion-model path to the standalone diffusion model
- --high-noise-diffusion-model path to the standalone high noise diffusion model
- --clip_l path to the clip-l text encoder
- --clip_g path to the clip-g text encoder
- --clip_vision path to the clip-vision encoder
- --t5xxl path to the t5xxl text encoder
- --vae [VAE] path to vae
- --taesd [TAESD_PATH] path to taesd. Using Tiny AutoEncoder for fast decoding (low quality)
- --control-net [CONTROL_PATH] path to control net model
- --embd-dir [EMBEDDING_PATH] path to embeddings
- --upscale-model [ESRGAN_PATH] path to esrgan model. Upscale images after generate, just RealESRGAN_x4plus_anime_6B supported by now
- --upscale-repeats Run the ESRGAN upscaler this many times (default 1)
- --type [TYPE] weight type (examples: f32, f16, q4_0, q4_1, q5_0, q5_1, q8_0, q2_K, q3_K, q4_K)
- If not specified, the default is the type of the weight file
- --tensor-type-rules [EXPRESSION] weight type per tensor pattern (example: "^vae\.=f16,model\.=q8_0")
- --lora-model-dir [DIR] lora model directory
- -i, --init-img [IMAGE] path to the init image, required by img2img
- --mask [MASK] path to the mask image, required by img2img with mask
- -i, --end-img [IMAGE] path to the end image, required by flf2v
- --control-image [IMAGE] path to image condition, control net
- -r, --ref-image [PATH] reference image for Flux Kontext models (can be used multiple times)
- --control-video [PATH] path to control video frames, It must be a directory path.
- The video frames inside should be stored as images in lexicographical (character) order
- For example, if the control video path is `frames`, the directory contain images such as 00.png, 01.png, … etc.
- --increase-ref-index automatically increase the indices of references images based on the order they are listed (starting with 1).
- -o, --output OUTPUT path to write result image to (default: ./output.png)
- -p, --prompt [PROMPT] the prompt to render
- -n, --negative-prompt PROMPT the negative prompt (default: "")
- --cfg-scale SCALE unconditional guidance scale: (default: 7.0)
- --img-cfg-scale SCALE image guidance scale for inpaint or instruct-pix2pix models: (default: same as --cfg-scale)
- --guidance SCALE distilled guidance scale for models with guidance input (default: 3.5)
- --slg-scale SCALE skip layer guidance (SLG) scale, only for DiT models: (default: 0)
- 0 means disabled, a value of 2.5 is nice for sd3.5 medium
- --eta SCALE eta in DDIM, only for DDIM and TCD: (default: 0)
- --skip-layers LAYERS Layers to skip for SLG steps: (default: [7,8,9])
- --skip-layer-start START SLG enabling point: (default: 0.01)
- --skip-layer-end END SLG disabling point: (default: 0.2)
- --scheduler {discrete, karras, exponential, ays, gits, smoothstep, sgm_uniform, simple} Denoiser sigma scheduler (default: discrete)
- --sampling-method {euler, euler_a, heun, dpm2, dpm++2s_a, dpm++2m, dpm++2mv2, ipndm, ipndm_v, lcm, ddim_trailing, tcd}
- sampling method (default: "euler" for Flux/SD3/Wan, "euler_a" otherwise)
- --timestep-shift N shift timestep for NitroFusion models, default: 0, recommended N for NitroSD-Realism around 250 and 500 for NitroSD-Vibrant
- --steps STEPS number of sample steps (default: 20)
- --high-noise-cfg-scale SCALE (high noise) unconditional guidance scale: (default: 7.0)
- --high-noise-img-cfg-scale SCALE (high noise) image guidance scale for inpaint or instruct-pix2pix models: (default: same as --cfg-scale)
- --high-noise-guidance SCALE (high noise) distilled guidance scale for models with guidance input (default: 3.5)
- --high-noise-slg-scale SCALE (high noise) skip layer guidance (SLG) scale, only for DiT models: (default: 0)
- 0 means disabled, a value of 2.5 is nice for sd3.5 medium
- --high-noise-eta SCALE (high noise) eta in DDIM, only for DDIM and TCD: (default: 0)
- --high-noise-skip-layers LAYERS (high noise) Layers to skip for SLG steps: (default: [7,8,9])
- --high-noise-skip-layer-start (high noise) SLG enabling point: (default: 0.01)
- --high-noise-skip-layer-end END (high noise) SLG disabling point: (default: 0.2)
- --high-noise-scheduler {discrete, karras, exponential, ays, gits, smoothstep, sgm_uniform, simple} Denoiser sigma scheduler (default: discrete)
- --high-noise-sampling-method {euler, euler_a, heun, dpm2, dpm++2s_a, dpm++2m, dpm++2mv2, ipndm, ipndm_v, lcm, ddim_trailing, tcd}
- (high noise) sampling method (default: "euler_a")
- --high-noise-steps STEPS (high noise) number of sample steps (default: -1 = auto)
- SLG will be enabled at step int([STEPS]*[START]) and disabled at int([STEPS]*[END])
- --strength STRENGTH strength for noising/unnoising (default: 0.75)
- --control-strength STRENGTH strength to apply Control Net (default: 0.9)
- 1.0 corresponds to full destruction of information in init image
- -H, --height H image height, in pixel space (default: 512)
- -W, --width W image width, in pixel space (default: 512)
- --rng {std_default, cuda} RNG (default: cuda)
- -s SEED, --seed SEED RNG seed (default: 42, use random seed for < 0)
- -b, --batch-count COUNT number of images to generate
- --clip-skip N ignore last layers of CLIP network; 1 ignores none, 2 ignores one layer (default: -1)
- <= 0 represents unspecified, will be 1 for SD1.x, 2 for SD2.x
- --vae-tiling process vae in tiles to reduce memory usage
- --vae-tile-size [X]x[Y] tile size for vae tiling (default: 32x32)
- --vae-relative-tile-size [X]x[Y] relative tile size for vae tiling, in fraction of image size if < 1, in number of tiles per dim if >=1 (overrides --vae-tile-size)
- --vae-tile-overlap OVERLAP tile overlap for vae tiling, in fraction of tile size (default: 0.5)
- --vae-on-cpu keep vae in cpu (for low vram)
- --clip-on-cpu keep clip in cpu (for low vram)
- --diffusion-fa use flash attention in the diffusion model (for low vram)
- Might lower quality, since it implies converting k and v to f16.
- This might crash if it is not supported by the backend.
- --diffusion-conv-direct use Conv2d direct in the diffusion model
- This might crash if it is not supported by the backend.
- --vae-conv-direct use Conv2d direct in the vae model (should improve the performance)
- This might crash if it is not supported by the backend.
- --control-net-cpu keep controlnet in cpu (for low vram)
- --canny apply canny preprocessor (edge detection)
- --color colors the logging tags according to level
- --chroma-disable-dit-mask disable dit mask for chroma
- --chroma-enable-t5-mask enable t5 mask for chroma
- --chroma-t5-mask-pad PAD_SIZE t5 mask pad size of chroma
- --video-frames video frames (default: 1)
- --fps fps (default: 24)
- --moe-boundary BOUNDARY timestep boundary for Wan2.2 MoE model. (default: 0.875)
- only enabled if `--high-noise-steps` is set to -1
- --flow-shift SHIFT shift value for Flow models like SD3.x or WAN (default: auto)
- --vace-strength wan vace strength
- --photo-maker path to PHOTOMAKER model
- --pm-id-images-dir [DIR] path to PHOTOMAKER input id images dir
- --pm-id-embed-path [PATH] path to PHOTOMAKER v2 id embed
- --pm-style-strength strength for keeping PHOTOMAKER input identity (default: 20)
- -v, --verbose print extra info
-```
-
-#### txt2img example
+### Generate an image with just one command
```sh
-./bin/sd -m ../models/sd-v1-4.ckpt -p "a lovely cat"
-# ./bin/sd -m ../models/v1-5-pruned-emaonly.safetensors -p "a lovely cat"
-# ./bin/sd -m ../models/sd_xl_base_1.0.safetensors --vae ../models/sdxl_vae-fp16-fix.safetensors -H 1024 -W 1024 -p "a lovely cat" -v
-# ./bin/sd -m ../models/sd3_medium_incl_clips_t5xxlfp16.safetensors -H 1024 -W 1024 -p 'a lovely cat holding a sign says \"Stable Diffusion CPP\"' --cfg-scale 4.5 --sampling-method euler -v --clip-on-cpu
-# ./bin/sd --diffusion-model ../models/flux1-dev-q3_k.gguf --vae ../models/ae.sft --clip_l ../models/clip_l.safetensors --t5xxl ../models/t5xxl_fp16.safetensors -p "a lovely cat holding a sign says 'flux.cpp'" --cfg-scale 1.0 --sampling-method euler -v --clip-on-cpu
-# ./bin/sd -m ..\models\sd3.5_large.safetensors --clip_l ..\models\clip_l.safetensors --clip_g ..\models\clip_g.safetensors --t5xxl ..\models\t5xxl_fp16.safetensors -H 1024 -W 1024 -p 'a lovely cat holding a sign says \"Stable diffusion 3.5 Large\"' --cfg-scale 4.5 --sampling-method euler -v --clip-on-cpu
+./bin/sd -m ../models/v1-5-pruned-emaonly.safetensors -p "a lovely cat"
```
-Using formats of different precisions will yield results of varying quality.
-
-| f32 | f16 |q8_0 |q5_0 |q5_1 |q4_0 |q4_1 |
-| ---- |---- |---- |---- |---- |---- |---- |
-|  | | | | | | |
-
-#### img2img example
+***For detailed command-line arguments, check out [cli doc](./examples/cli/README.md).***
-- `./output.png` is the image generated from the above txt2img pipeline
+## Performance
-
-```
-./bin/sd -m ../models/sd-v1-4.ckpt -p "cat with blue eyes" -i ./output.png -o ./img2img_output.png --strength 0.4
-```
-
-
-
-
+If you want to improve performance or reduce VRAM/RAM usage, please refer to [performance guide](./docs/performance.md).
## More Guides
+- [SD1.x/SD2.x/SDXL](./docs/sd.md)
+- [SD3/SD3.5](./docs/sd3.md)
+- [Flux-dev/Flux-schnell](./docs/flux.md)
+- [FLUX.1-Kontext-dev](./docs/kontext.md)
+- [Chroma](./docs/chroma.md)
+- [🔥Qwen Image](./docs/qwen_image.md)
+- [🔥Qwen Image Edit/Qwen Image Edit 2509](./docs/qwen_image_edit.md)
+- [🔥Wan2.1/Wan2.2](./docs/wan.md)
- [LoRA](./docs/lora.md)
- [LCM/LCM-LoRA](./docs/lcm.md)
- [Using PhotoMaker to personalize image generation](./docs/photo_maker.md)
@@ -449,6 +151,8 @@ These projects use `stable-diffusion.cpp` as a backend for their image generatio
- [Local Diffusion](https://github.com/rmatif/Local-Diffusion)
- [sd.cpp-webui](https://github.com/daniandtheweb/sd.cpp-webui)
- [LocalAI](https://github.com/mudler/LocalAI)
+- [Neural-Pixel](https://github.com/Luiz-Alcantara/Neural-Pixel)
+- [KoboldCpp](https://github.com/LostRuins/koboldcpp)
## Contributors
@@ -462,7 +166,8 @@ Thank you to all the people who have already contributed to stable-diffusion.cpp
## References
-- [ggml](https://github.com/ggerganov/ggml)
+- [ggml](https://github.com/ggml-org/ggml)
+- [diffusers](https://github.com/huggingface/diffusers)
- [stable-diffusion](https://github.com/CompVis/stable-diffusion)
- [sd3-ref](https://github.com/Stability-AI/sd3-ref)
- [stable-diffusion-stability-ai](https://github.com/Stability-AI/stablediffusion)
@@ -473,4 +178,4 @@ Thank you to all the people who have already contributed to stable-diffusion.cpp
- [generative-models](https://github.com/Stability-AI/generative-models/)
- [PhotoMaker](https://github.com/TencentARC/PhotoMaker)
- [Wan2.1](https://github.com/Wan-Video/Wan2.1)
-- [Wan2.2](https://github.com/Wan-Video/Wan2.2)
\ No newline at end of file
+- [Wan2.2](https://github.com/Wan-Video/Wan2.2)
diff --git a/assets/flux/chroma1-radiance.png b/assets/flux/chroma1-radiance.png
new file mode 100644
index 000000000..1dd4a524a
Binary files /dev/null and b/assets/flux/chroma1-radiance.png differ
diff --git a/assets/qwen/example.png b/assets/qwen/example.png
new file mode 100644
index 000000000..f51bbd519
Binary files /dev/null and b/assets/qwen/example.png differ
diff --git a/assets/qwen/qwen_image_edit.png b/assets/qwen/qwen_image_edit.png
new file mode 100644
index 000000000..c2a31eda9
Binary files /dev/null and b/assets/qwen/qwen_image_edit.png differ
diff --git a/assets/qwen/qwen_image_edit_2509.png b/assets/qwen/qwen_image_edit_2509.png
new file mode 100644
index 000000000..442ba9b3c
Binary files /dev/null and b/assets/qwen/qwen_image_edit_2509.png differ
diff --git a/clip.hpp b/clip.hpp
index 546704c8b..eb37638cc 100644
--- a/clip.hpp
+++ b/clip.hpp
@@ -6,7 +6,7 @@
/*================================================== CLIPTokenizer ===================================================*/
-std::pair<std::unordered_map<std::string, float>, std::string> extract_and_remove_lora(std::string text) {
+__STATIC_INLINE__ std::pair<std::unordered_map<std::string, float>, std::string> extract_and_remove_lora(std::string text) {
std::regex re("]+)>");
std::smatch matches;
std::unordered_map<std::string, float> filename2multiplier;
@@ -31,7 +31,7 @@ std::pair<std::unordered_map<std::string, float>, std::string> extract_and_remov
return std::make_pair(filename2multiplier, text);
}
-std::vector<std::pair<int, std::u32string>> bytes_to_unicode() {
+__STATIC_INLINE__ std::vector<std::pair<int, std::u32string>> bytes_to_unicode() {
std::vector<std::pair<int, std::u32string>> byte_unicode_pairs;
std::set<int> byte_set;
for (int b = static_cast<int>('!'); b <= static_cast<int>('~'); ++b) {
@@ -451,16 +451,16 @@ struct CLIPMLP : public GGMLBlock {
}
}
- struct ggml_tensor* forward(struct ggml_context* ctx, struct ggml_tensor* x) {
+ struct ggml_tensor* forward(GGMLRunnerContext* ctx, struct ggml_tensor* x) {
// x: [N, n_token, d_model]
auto fc1 = std::dynamic_pointer_cast<Linear>(blocks["fc1"]);
auto fc2 = std::dynamic_pointer_cast<Linear>(blocks["fc2"]);
x = fc1->forward(ctx, x);
if (use_gelu) {
- x = ggml_gelu_inplace(ctx, x);
+ x = ggml_gelu_inplace(ctx->ggml_ctx, x);
} else {
- x = ggml_gelu_quick_inplace(ctx, x);
+ x = ggml_gelu_quick_inplace(ctx->ggml_ctx, x);
}
x = fc2->forward(ctx, x);
return x;
@@ -476,11 +476,12 @@ struct CLIPLayer : public GGMLBlock {
public:
CLIPLayer(int64_t d_model,
int64_t n_head,
- int64_t intermediate_size)
+ int64_t intermediate_size,
+ bool proj_in = false)
: d_model(d_model),
n_head(n_head),
intermediate_size(intermediate_size) {
- blocks["self_attn"] = std::shared_ptr(new MultiheadAttention(d_model, n_head, true, true));
+ blocks["self_attn"] = std::shared_ptr(new MultiheadAttention(d_model, n_head, true, true, proj_in));
blocks["layer_norm1"] = std::shared_ptr(new LayerNorm(d_model));
blocks["layer_norm2"] = std::shared_ptr(new LayerNorm(d_model));
@@ -488,15 +489,15 @@ struct CLIPLayer : public GGMLBlock {
blocks["mlp"] = std::shared_ptr(new CLIPMLP(d_model, intermediate_size));
}
- struct ggml_tensor* forward(struct ggml_context* ctx, ggml_backend_t backend, struct ggml_tensor* x, bool mask = true) {
+ struct ggml_tensor* forward(GGMLRunnerContext* ctx, struct ggml_tensor* x, bool mask = true) {
// x: [N, n_token, d_model]
auto self_attn = std::dynamic_pointer_cast<MultiheadAttention>(blocks["self_attn"]);
auto layer_norm1 = std::dynamic_pointer_cast<LayerNorm>(blocks["layer_norm1"]);
auto layer_norm2 = std::dynamic_pointer_cast<LayerNorm>(blocks["layer_norm2"]);
auto mlp = std::dynamic_pointer_cast<CLIPMLP>(blocks["mlp"]);
- x = ggml_add(ctx, x, self_attn->forward(ctx, backend, layer_norm1->forward(ctx, x), mask));
- x = ggml_add(ctx, x, mlp->forward(ctx, layer_norm2->forward(ctx, x)));
+ x = ggml_add(ctx->ggml_ctx, x, self_attn->forward(ctx, layer_norm1->forward(ctx, x), mask));
+ x = ggml_add(ctx->ggml_ctx, x, mlp->forward(ctx, layer_norm2->forward(ctx, x)));
return x;
}
};
@@ -509,16 +510,16 @@ struct CLIPEncoder : public GGMLBlock {
CLIPEncoder(int64_t n_layer,
int64_t d_model,
int64_t n_head,
- int64_t intermediate_size)
+ int64_t intermediate_size,
+ bool proj_in = false)
: n_layer(n_layer) {
for (int i = 0; i < n_layer; i++) {
std::string name = "layers." + std::to_string(i);
- blocks[name] = std::shared_ptr<GGMLBlock>(new CLIPLayer(d_model, n_head, intermediate_size));
+ blocks[name] = std::shared_ptr<GGMLBlock>(new CLIPLayer(d_model, n_head, intermediate_size, proj_in));
}
}
- struct ggml_tensor* forward(struct ggml_context* ctx,
- ggml_backend_t backend,
+ struct ggml_tensor* forward(GGMLRunnerContext* ctx,
struct ggml_tensor* x,
int clip_skip = -1,
bool mask = true) {
@@ -536,7 +537,7 @@ struct CLIPEncoder : public GGMLBlock {
}
std::string name = "layers." + std::to_string(i);
auto layer = std::dynamic_pointer_cast<CLIPLayer>(blocks[name]);
- x = layer->forward(ctx, backend, x, mask); // [N, n_token, d_model]
+ x = layer->forward(ctx, x, mask); // [N, n_token, d_model]
// LOG_DEBUG("layer %d", i);
}
return x;
@@ -550,10 +551,10 @@ class CLIPEmbeddings : public GGMLBlock {
int64_t num_positions;
bool force_clip_f32;
- void init_params(struct ggml_context* ctx, const String2GGMLType& tensor_types = {}, const std::string prefix = "") {
+ void init_params(struct ggml_context* ctx, const String2TensorStorage& tensor_storage_map = {}, const std::string prefix = "") override {
enum ggml_type token_wtype = GGML_TYPE_F32;
if (!force_clip_f32) {
- token_wtype = get_type(prefix + "token_embedding.weight", tensor_types, GGML_TYPE_F32);
+ token_wtype = get_type(prefix + "token_embedding.weight", tensor_storage_map, GGML_TYPE_F32);
if (!support_get_rows(token_wtype)) {
token_wtype = GGML_TYPE_F32;
}
@@ -578,7 +579,7 @@ class CLIPEmbeddings : public GGMLBlock {
return params["token_embedding.weight"];
}
- struct ggml_tensor* forward(struct ggml_context* ctx,
+ struct ggml_tensor* forward(GGMLRunnerContext* ctx,
struct ggml_tensor* input_ids,
struct ggml_tensor* custom_embed_weight) {
// input_ids: [N, n_token]
@@ -586,12 +587,12 @@ class CLIPEmbeddings : public GGMLBlock {
auto position_embed_weight = params["position_embedding.weight"];
GGML_ASSERT(input_ids->ne[0] == position_embed_weight->ne[1]);
- input_ids = ggml_reshape_3d(ctx, input_ids, input_ids->ne[0], 1, input_ids->ne[1]);
- auto token_embedding = ggml_get_rows(ctx, custom_embed_weight != NULL ? custom_embed_weight : token_embed_weight, input_ids);
- token_embedding = ggml_reshape_3d(ctx, token_embedding, token_embedding->ne[0], token_embedding->ne[1], token_embedding->ne[3]);
+ input_ids = ggml_reshape_3d(ctx->ggml_ctx, input_ids, input_ids->ne[0], 1, input_ids->ne[1]);
+ auto token_embedding = ggml_get_rows(ctx->ggml_ctx, custom_embed_weight != nullptr ? custom_embed_weight : token_embed_weight, input_ids);
+ token_embedding = ggml_reshape_3d(ctx->ggml_ctx, token_embedding, token_embedding->ne[0], token_embedding->ne[1], token_embedding->ne[3]);
// token_embedding + position_embedding
- auto x = ggml_add(ctx,
+ auto x = ggml_add(ctx->ggml_ctx,
token_embedding,
position_embed_weight); // [N, n_token, embed_dim]
return x;
@@ -606,7 +607,8 @@ class CLIPVisionEmbeddings : public GGMLBlock {
int64_t image_size;
int64_t num_patches;
int64_t num_positions;
- void init_params(struct ggml_context* ctx, const String2GGMLType& tensor_types = {}, const std::string prefix = "") {
+
+ void init_params(struct ggml_context* ctx, const String2TensorStorage& tensor_storage_map = {}, const std::string prefix = "") override {
enum ggml_type patch_wtype = GGML_TYPE_F16;
enum ggml_type class_wtype = GGML_TYPE_F32;
enum ggml_type position_wtype = GGML_TYPE_F32;
@@ -629,7 +631,7 @@ class CLIPVisionEmbeddings : public GGMLBlock {
num_positions = num_patches + 1;
}
- struct ggml_tensor* forward(struct ggml_context* ctx, struct ggml_tensor* pixel_values) {
+ struct ggml_tensor* forward(GGMLRunnerContext* ctx, struct ggml_tensor* pixel_values) {
// pixel_values: [N, num_channels, image_size, image_size]
// return: [N, num_positions, embed_dim]
GGML_ASSERT(pixel_values->ne[0] == image_size && pixel_values->ne[1] == image_size && pixel_values->ne[2] == num_channels);
@@ -641,18 +643,18 @@ class CLIPVisionEmbeddings : public GGMLBlock {
// concat(patch_embedding, class_embedding) + position_embedding
struct ggml_tensor* patch_embedding;
int64_t N = pixel_values->ne[3];
- patch_embedding = ggml_nn_conv_2d(ctx, pixel_values, patch_embed_weight, NULL, patch_size, patch_size); // [N, embed_dim, image_size // pacht_size, image_size // pacht_size]
- patch_embedding = ggml_reshape_3d(ctx, patch_embedding, num_patches, embed_dim, N); // [N, embed_dim, num_patches]
- patch_embedding = ggml_cont(ctx, ggml_permute(ctx, patch_embedding, 1, 0, 2, 3)); // [N, num_patches, embed_dim]
- patch_embedding = ggml_reshape_4d(ctx, patch_embedding, 1, embed_dim, num_patches, N); // [N, num_patches, embed_dim, 1]
-
- struct ggml_tensor* class_embedding = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, embed_dim, N);
- class_embedding = ggml_repeat(ctx, class_embed_weight, class_embedding); // [N, embed_dim]
- class_embedding = ggml_reshape_4d(ctx, class_embedding, 1, embed_dim, 1, N); // [N, 1, embed_dim, 1]
-
- struct ggml_tensor* x = ggml_concat(ctx, class_embedding, patch_embedding, 2); // [N, num_positions, embed_dim, 1]
- x = ggml_reshape_3d(ctx, x, embed_dim, num_positions, N); // [N, num_positions, embed_dim]
- x = ggml_add(ctx, x, position_embed_weight);
+ patch_embedding = ggml_ext_conv_2d(ctx->ggml_ctx, pixel_values, patch_embed_weight, nullptr, patch_size, patch_size); // [N, embed_dim, image_size // pacht_size, image_size // pacht_size]
+ patch_embedding = ggml_reshape_3d(ctx->ggml_ctx, patch_embedding, num_patches, embed_dim, N); // [N, embed_dim, num_patches]
+ patch_embedding = ggml_cont(ctx->ggml_ctx, ggml_permute(ctx->ggml_ctx, patch_embedding, 1, 0, 2, 3)); // [N, num_patches, embed_dim]
+ patch_embedding = ggml_reshape_4d(ctx->ggml_ctx, patch_embedding, 1, embed_dim, num_patches, N); // [N, num_patches, embed_dim, 1]
+
+ struct ggml_tensor* class_embedding = ggml_new_tensor_2d(ctx->ggml_ctx, GGML_TYPE_F32, embed_dim, N);
+ class_embedding = ggml_repeat(ctx->ggml_ctx, class_embed_weight, class_embedding); // [N, embed_dim]
+ class_embedding = ggml_reshape_4d(ctx->ggml_ctx, class_embedding, 1, embed_dim, 1, N); // [N, 1, embed_dim, 1]
+
+ struct ggml_tensor* x = ggml_concat(ctx->ggml_ctx, class_embedding, patch_embedding, 2); // [N, num_positions, embed_dim, 1]
+ x = ggml_reshape_3d(ctx->ggml_ctx, x, embed_dim, num_positions, N); // [N, num_positions, embed_dim]
+ x = ggml_add(ctx->ggml_ctx, x, position_embed_weight);
return x; // [N, num_positions, embed_dim]
}
};
@@ -669,7 +671,7 @@ enum CLIPVersion {
class CLIPTextModel : public GGMLBlock {
protected:
- void init_params(struct ggml_context* ctx, const String2GGMLType& tensor_types = {}, const std::string prefix = "") {
+ void init_params(struct ggml_context* ctx, const String2TensorStorage& tensor_storage_map = {}, const std::string prefix = "") override {
if (version == OPEN_CLIP_VIT_BIGG_14) {
enum ggml_type wtype = GGML_TYPE_F32;
params["text_projection"] = ggml_new_tensor_2d(ctx, wtype, projection_dim, hidden_size);
@@ -690,7 +692,8 @@ class CLIPTextModel : public GGMLBlock {
CLIPTextModel(CLIPVersion version = OPENAI_CLIP_VIT_L_14,
bool with_final_ln = true,
- bool force_clip_f32 = false)
+ bool force_clip_f32 = false,
+ bool proj_in = false)
: version(version), with_final_ln(with_final_ln) {
if (version == OPEN_CLIP_VIT_H_14) {
hidden_size = 1024;
@@ -705,7 +708,7 @@ class CLIPTextModel : public GGMLBlock {
}
blocks["embeddings"] = std::shared_ptr(new CLIPEmbeddings(hidden_size, vocab_size, n_token, force_clip_f32));
- blocks["encoder"] = std::shared_ptr(new CLIPEncoder(n_layer, hidden_size, n_head, intermediate_size));
+ blocks["encoder"] = std::shared_ptr(new CLIPEncoder(n_layer, hidden_size, n_head, intermediate_size, proj_in));
blocks["final_layer_norm"] = std::shared_ptr(new LayerNorm(hidden_size));
}
@@ -714,8 +717,7 @@ class CLIPTextModel : public GGMLBlock {
return embeddings->get_token_embed_weight();
}
- struct ggml_tensor* forward(struct ggml_context* ctx,
- ggml_backend_t backend,
+ struct ggml_tensor* forward(GGMLRunnerContext* ctx,
struct ggml_tensor* input_ids,
struct ggml_tensor* tkn_embeddings,
size_t max_token_idx = 0,
@@ -727,16 +729,16 @@ class CLIPTextModel : public GGMLBlock {
auto final_layer_norm = std::dynamic_pointer_cast<LayerNorm>(blocks["final_layer_norm"]);
auto x = embeddings->forward(ctx, input_ids, tkn_embeddings); // [N, n_token, hidden_size]
- x = encoder->forward(ctx, backend, x, return_pooled ? -1 : clip_skip, true);
+ x = encoder->forward(ctx, x, return_pooled ? -1 : clip_skip, true);
if (return_pooled || with_final_ln) {
x = final_layer_norm->forward(ctx, x);
}
if (return_pooled) {
auto text_projection = params["text_projection"];
- ggml_tensor* pooled = ggml_view_1d(ctx, x, hidden_size, x->nb[1] * max_token_idx);
- if (text_projection != NULL) {
- pooled = ggml_nn_linear(ctx, pooled, text_projection, NULL);
+ ggml_tensor* pooled = ggml_view_1d(ctx->ggml_ctx, x, hidden_size, x->nb[1] * max_token_idx);
+ if (text_projection != nullptr) {
+ pooled = ggml_ext_linear(ctx->ggml_ctx, pooled, text_projection, nullptr);
} else {
LOG_DEBUG("identity projection");
}
@@ -760,7 +762,7 @@ class CLIPVisionModel : public GGMLBlock {
int32_t n_layer = 24;
public:
- CLIPVisionModel(CLIPVersion version = OPENAI_CLIP_VIT_L_14) {
+ CLIPVisionModel(CLIPVersion version = OPENAI_CLIP_VIT_L_14, bool proj_in = false) {
if (version == OPEN_CLIP_VIT_H_14) {
hidden_size = 1280;
intermediate_size = 5120;
@@ -775,12 +777,11 @@ class CLIPVisionModel : public GGMLBlock {
blocks["embeddings"] = std::shared_ptr(new CLIPVisionEmbeddings(hidden_size, num_channels, patch_size, image_size));
blocks["pre_layernorm"] = std::shared_ptr(new LayerNorm(hidden_size));
- blocks["encoder"] = std::shared_ptr(new CLIPEncoder(n_layer, hidden_size, n_head, intermediate_size));
+ blocks["encoder"] = std::shared_ptr(new CLIPEncoder(n_layer, hidden_size, n_head, intermediate_size, proj_in));
blocks["post_layernorm"] = std::shared_ptr(new LayerNorm(hidden_size));
}
- struct ggml_tensor* forward(struct ggml_context* ctx,
- ggml_backend_t backend,
+ struct ggml_tensor* forward(GGMLRunnerContext* ctx,
struct ggml_tensor* pixel_values,
bool return_pooled = true,
int clip_skip = -1) {
@@ -792,14 +793,14 @@ class CLIPVisionModel : public GGMLBlock {
auto x = embeddings->forward(ctx, pixel_values); // [N, num_positions, embed_dim]
x = pre_layernorm->forward(ctx, x);
- x = encoder->forward(ctx, backend, x, clip_skip, false);
+ x = encoder->forward(ctx, x, clip_skip, false);
// print_ggml_tensor(x, true, "ClipVisionModel x: ");
auto last_hidden_state = x;
x = post_layernorm->forward(ctx, x); // [N, n_token, hidden_size]
GGML_ASSERT(x->ne[3] == 1);
if (return_pooled) {
- ggml_tensor* pooled = ggml_cont(ctx, ggml_view_2d(ctx, x, x->ne[0], x->ne[2], x->nb[2], 0));
+ ggml_tensor* pooled = ggml_cont(ctx->ggml_ctx, ggml_view_2d(ctx->ggml_ctx, x, x->ne[0], x->ne[2], x->nb[2], 0));
return pooled; // [N, hidden_size]
} else {
// return x; // [N, n_token, hidden_size]
@@ -814,8 +815,8 @@ class CLIPProjection : public UnaryBlock {
int64_t out_features;
bool transpose_weight;
- void init_params(struct ggml_context* ctx, const String2GGMLType& tensor_types = {}, const std::string prefix = "") {
- enum ggml_type wtype = get_type(prefix + "weight", tensor_types, GGML_TYPE_F32);
+ void init_params(struct ggml_context* ctx, const String2TensorStorage& tensor_storage_map = {}, const std::string prefix = "") override {
+ enum ggml_type wtype = get_type(prefix + "weight", tensor_storage_map, GGML_TYPE_F32);
if (transpose_weight) {
params["weight"] = ggml_new_tensor_2d(ctx, wtype, out_features, in_features);
} else {
@@ -831,12 +832,12 @@ class CLIPProjection : public UnaryBlock {
out_features(out_features),
transpose_weight(transpose_weight) {}
- struct ggml_tensor* forward(struct ggml_context* ctx, struct ggml_tensor* x) {
+ struct ggml_tensor* forward(GGMLRunnerContext* ctx, struct ggml_tensor* x) override {
struct ggml_tensor* w = params["weight"];
if (transpose_weight) {
- w = ggml_cont(ctx, ggml_transpose(ctx, w));
+ w = ggml_cont(ctx->ggml_ctx, ggml_transpose(ctx->ggml_ctx, w));
}
- return ggml_nn_linear(ctx, x, w, NULL);
+ return ggml_ext_linear(ctx->ggml_ctx, x, w, nullptr);
}
};
@@ -848,7 +849,8 @@ class CLIPVisionModelProjection : public GGMLBlock {
public:
CLIPVisionModelProjection(CLIPVersion version = OPENAI_CLIP_VIT_L_14,
- bool transpose_proj_w = false) {
+ bool transpose_proj_w = false,
+ bool proj_in = false) {
if (version == OPEN_CLIP_VIT_H_14) {
hidden_size = 1280;
projection_dim = 1024;
@@ -856,12 +858,11 @@ class CLIPVisionModelProjection : public GGMLBlock {
hidden_size = 1664;
}
- blocks["vision_model"] = std::shared_ptr(new CLIPVisionModel(version));
+ blocks["vision_model"] = std::shared_ptr(new CLIPVisionModel(version, proj_in));
blocks["visual_projection"] = std::shared_ptr(new CLIPProjection(hidden_size, projection_dim, transpose_proj_w));
}
- struct ggml_tensor* forward(struct ggml_context* ctx,
- ggml_backend_t backend,
+ struct ggml_tensor* forward(GGMLRunnerContext* ctx,
struct ggml_tensor* pixel_values,
bool return_pooled = true,
int clip_skip = -1) {
@@ -870,7 +871,7 @@ class CLIPVisionModelProjection : public GGMLBlock {
auto vision_model = std::dynamic_pointer_cast<CLIPVisionModel>(blocks["vision_model"]);
auto visual_projection = std::dynamic_pointer_cast<CLIPProjection>(blocks["visual_projection"]);
- auto x = vision_model->forward(ctx, backend, pixel_values, return_pooled, clip_skip); // [N, hidden_size] or [N, n_token, hidden_size]
+ auto x = vision_model->forward(ctx, pixel_values, return_pooled, clip_skip); // [N, hidden_size] or [N, n_token, hidden_size]
if (return_pooled) {
x = visual_projection->forward(ctx, x); // [N, projection_dim]
@@ -885,16 +886,27 @@ struct CLIPTextModelRunner : public GGMLRunner {
CLIPTextModelRunner(ggml_backend_t backend,
bool offload_params_to_cpu,
- const String2GGMLType& tensor_types,
+ const String2TensorStorage& tensor_storage_map,
const std::string prefix,
CLIPVersion version = OPENAI_CLIP_VIT_L_14,
bool with_final_ln = true,
bool force_clip_f32 = false)
- : GGMLRunner(backend, offload_params_to_cpu), model(version, with_final_ln, force_clip_f32) {
- model.init(params_ctx, tensor_types, prefix);
+ : GGMLRunner(backend, offload_params_to_cpu) {
+ bool proj_in = false;
+ for (const auto& [name, tensor_storage] : tensor_storage_map) {
+ if (!starts_with(name, prefix)) {
+ continue;
+ }
+ if (contains(name, "self_attn.in_proj")) {
+ proj_in = true;
+ break;
+ }
+ }
+ model = CLIPTextModel(version, with_final_ln, force_clip_f32, proj_in);
+ model.init(params_ctx, tensor_storage_map, prefix);
}
- std::string get_desc() {
+ std::string get_desc() override {
return "clip";
}
@@ -902,8 +914,7 @@ struct CLIPTextModelRunner : public GGMLRunner {
model.get_param_tensors(tensors, prefix);
}
- struct ggml_tensor* forward(struct ggml_context* ctx,
- ggml_backend_t backend,
+ struct ggml_tensor* forward(GGMLRunnerContext* ctx,
struct ggml_tensor* input_ids,
struct ggml_tensor* embeddings,
size_t max_token_idx = 0,
@@ -913,15 +924,15 @@ struct CLIPTextModelRunner : public GGMLRunner {
size_t n_token = input_ids->ne[0];
if (input_ids->ne[0] > model.n_token) {
GGML_ASSERT(input_ids->ne[0] % model.n_token == 0);
- input_ids = ggml_reshape_2d(ctx, input_ids, model.n_token, input_ids->ne[0] / model.n_token);
+ input_ids = ggml_reshape_2d(ctx->ggml_ctx, input_ids, model.n_token, input_ids->ne[0] / model.n_token);
}
- return model.forward(ctx, backend, input_ids, embeddings, max_token_idx, return_pooled, clip_skip);
+ return model.forward(ctx, input_ids, embeddings, max_token_idx, return_pooled, clip_skip);
}
struct ggml_cgraph* build_graph(struct ggml_tensor* input_ids,
int num_custom_embeddings = 0,
- void* custom_embeddings_data = NULL,
+ void* custom_embeddings_data = nullptr,
size_t max_token_idx = 0,
bool return_pooled = false,
int clip_skip = -1) {
@@ -929,9 +940,9 @@ struct CLIPTextModelRunner : public GGMLRunner {
input_ids = to_backend(input_ids);
- struct ggml_tensor* embeddings = NULL;
+ struct ggml_tensor* embeddings = nullptr;
- if (num_custom_embeddings > 0 && custom_embeddings_data != NULL) {
+ if (num_custom_embeddings > 0 && custom_embeddings_data != nullptr) {
auto token_embed_weight = model.get_token_embed_weight();
auto custom_embeddings = ggml_new_tensor_2d(compute_ctx,
token_embed_weight->type,
@@ -943,7 +954,9 @@ struct CLIPTextModelRunner : public GGMLRunner {
embeddings = ggml_concat(compute_ctx, token_embed_weight, custom_embeddings, 1);
}
- struct ggml_tensor* hidden_states = forward(compute_ctx, runtime_backend, input_ids, embeddings, max_token_idx, return_pooled, clip_skip);
+ auto runner_ctx = get_context();
+
+ struct ggml_tensor* hidden_states = forward(&runner_ctx, input_ids, embeddings, max_token_idx, return_pooled, clip_skip);
ggml_build_forward_expand(gf, hidden_states);
@@ -958,7 +971,7 @@ struct CLIPTextModelRunner : public GGMLRunner {
bool return_pooled,
int clip_skip,
ggml_tensor** output,
- ggml_context* output_ctx = NULL) {
+ ggml_context* output_ctx = nullptr) {
auto get_graph = [&]() -> struct ggml_cgraph* {
return build_graph(input_ids, num_custom_embeddings, custom_embeddings_data, max_token_idx, return_pooled, clip_skip);
};
diff --git a/common.hpp b/common.hpp
index bf4da24ec..c68ddafe5 100644
--- a/common.hpp
+++ b/common.hpp
@@ -23,12 +23,12 @@ class DownSampleBlock : public GGMLBlock {
}
}
- struct ggml_tensor* forward(struct ggml_context* ctx, struct ggml_tensor* x) {
+ struct ggml_tensor* forward(GGMLRunnerContext* ctx, struct ggml_tensor* x) {
// x: [N, channels, h, w]
if (vae_downsample) {
auto conv = std::dynamic_pointer_cast(blocks["conv"]);
- x = ggml_pad(ctx, x, 1, 1, 0, 0);
+ x = ggml_pad(ctx->ggml_ctx, x, 1, 1, 0, 0);
x = conv->forward(ctx, x);
} else {
auto conv = std::dynamic_pointer_cast(blocks["op"]);
@@ -52,12 +52,12 @@ class UpSampleBlock : public GGMLBlock {
blocks["conv"] = std::shared_ptr(new Conv2d(channels, out_channels, {3, 3}, {1, 1}, {1, 1}));
}
- struct ggml_tensor* forward(struct ggml_context* ctx, struct ggml_tensor* x) {
+ struct ggml_tensor* forward(GGMLRunnerContext* ctx, struct ggml_tensor* x) {
// x: [N, channels, h, w]
auto conv = std::dynamic_pointer_cast(blocks["conv"]);
- x = ggml_upscale(ctx, x, 2, GGML_SCALE_MODE_NEAREST); // [N, channels, h*2, w*2]
- x = conv->forward(ctx, x); // [N, out_channels, h*2, w*2]
+ x = ggml_upscale(ctx->ggml_ctx, x, 2, GGML_SCALE_MODE_NEAREST); // [N, channels, h*2, w*2]
+ x = conv->forward(ctx, x); // [N, out_channels, h*2, w*2]
return x;
}
};
@@ -121,7 +121,7 @@ class ResBlock : public GGMLBlock {
}
}
- virtual struct ggml_tensor* forward(struct ggml_context* ctx, struct ggml_tensor* x, struct ggml_tensor* emb = NULL) {
+ virtual struct ggml_tensor* forward(GGMLRunnerContext* ctx, struct ggml_tensor* x, struct ggml_tensor* emb = nullptr) {
// For dims==3, we reduce dimension from 5d to 4d by merging h and w, in order not to change ggml
// [N, c, t, h, w] => [N, c, t, h * w]
// x: [N, channels, h, w] if dims == 2 else [N, channels, t, h, w]
@@ -131,38 +131,38 @@ class ResBlock : public GGMLBlock {
auto out_layers_0 = std::dynamic_pointer_cast(blocks["out_layers.0"]);
auto out_layers_3 = std::dynamic_pointer_cast(blocks["out_layers.3"]);
- if (emb == NULL) {
+ if (emb == nullptr) {
GGML_ASSERT(skip_t_emb);
}
// in_layers
auto h = in_layers_0->forward(ctx, x);
- h = ggml_silu_inplace(ctx, h);
+ h = ggml_silu_inplace(ctx->ggml_ctx, h);
h = in_layers_2->forward(ctx, h); // [N, out_channels, h, w] if dims == 2 else [N, out_channels, t, h, w]
// emb_layers
if (!skip_t_emb) {
auto emb_layer_1 = std::dynamic_pointer_cast(blocks["emb_layers.1"]);
- auto emb_out = ggml_silu(ctx, emb);
+ auto emb_out = ggml_silu(ctx->ggml_ctx, emb);
emb_out = emb_layer_1->forward(ctx, emb_out); // [N, out_channels] if dims == 2 else [N, t, out_channels]
if (dims == 2) {
- emb_out = ggml_reshape_4d(ctx, emb_out, 1, 1, emb_out->ne[0], emb_out->ne[1]); // [N, out_channels, 1, 1]
+ emb_out = ggml_reshape_4d(ctx->ggml_ctx, emb_out, 1, 1, emb_out->ne[0], emb_out->ne[1]); // [N, out_channels, 1, 1]
} else {
- emb_out = ggml_reshape_4d(ctx, emb_out, 1, emb_out->ne[0], emb_out->ne[1], emb_out->ne[2]); // [N, t, out_channels, 1]
+ emb_out = ggml_reshape_4d(ctx->ggml_ctx, emb_out, 1, emb_out->ne[0], emb_out->ne[1], emb_out->ne[2]); // [N, t, out_channels, 1]
if (exchange_temb_dims) {
// emb_out = rearrange(emb_out, "b t c ... -> b c t ...")
- emb_out = ggml_cont(ctx, ggml_permute(ctx, emb_out, 0, 2, 1, 3)); // [N, out_channels, t, 1]
+ emb_out = ggml_cont(ctx->ggml_ctx, ggml_permute(ctx->ggml_ctx, emb_out, 0, 2, 1, 3)); // [N, out_channels, t, 1]
}
}
- h = ggml_add(ctx, h, emb_out); // [N, out_channels, h, w] if dims == 2 else [N, out_channels, t, h, w]
+ h = ggml_add(ctx->ggml_ctx, h, emb_out); // [N, out_channels, h, w] if dims == 2 else [N, out_channels, t, h, w]
}
// out_layers
h = out_layers_0->forward(ctx, h);
- h = ggml_silu_inplace(ctx, h);
+ h = ggml_silu_inplace(ctx->ggml_ctx, h);
// dropout, skip for inference
h = out_layers_3->forward(ctx, h);
@@ -172,18 +172,18 @@ class ResBlock : public GGMLBlock {
x = skip_connection->forward(ctx, x); // [N, out_channels, h, w] if dims == 2 else [N, out_channels, t, h, w]
}
- h = ggml_add(ctx, h, x);
+ h = ggml_add(ctx->ggml_ctx, h, x);
return h; // [N, out_channels, h, w] if dims == 2 else [N, out_channels, t, h, w]
}
};
-class GEGLU : public GGMLBlock {
+class GEGLU : public UnaryBlock {
protected:
int64_t dim_in;
int64_t dim_out;
- void init_params(struct ggml_context* ctx, const String2GGMLType& tensor_types = {}, std::string prefix = "") {
- enum ggml_type wtype = get_type(prefix + "proj.weight", tensor_types, GGML_TYPE_F32);
+ void init_params(struct ggml_context* ctx, const String2TensorStorage& tensor_storage_map = {}, std::string prefix = "") override {
+ enum ggml_type wtype = get_type(prefix + "proj.weight", tensor_storage_map, GGML_TYPE_F32);
enum ggml_type bias_wtype = GGML_TYPE_F32;
params["proj.weight"] = ggml_new_tensor_2d(ctx, wtype, dim_in, dim_out * 2);
params["proj.bias"] = ggml_new_tensor_1d(ctx, bias_wtype, dim_out * 2);
@@ -193,46 +193,80 @@ class GEGLU : public GGMLBlock {
GEGLU(int64_t dim_in, int64_t dim_out)
: dim_in(dim_in), dim_out(dim_out) {}
- struct ggml_tensor* forward(struct ggml_context* ctx, struct ggml_tensor* x) {
+ struct ggml_tensor* forward(GGMLRunnerContext* ctx, struct ggml_tensor* x) override {
// x: [ne3, ne2, ne1, dim_in]
// return: [ne3, ne2, ne1, dim_out]
struct ggml_tensor* w = params["proj.weight"];
struct ggml_tensor* b = params["proj.bias"];
- auto x_w = ggml_view_2d(ctx, w, w->ne[0], w->ne[1] / 2, w->nb[1], 0); // [dim_out, dim_in]
- auto x_b = ggml_view_1d(ctx, b, b->ne[0] / 2, 0); // [dim_out, dim_in]
- auto gate_w = ggml_view_2d(ctx, w, w->ne[0], w->ne[1] / 2, w->nb[1], w->nb[1] * w->ne[1] / 2); // [dim_out, ]
- auto gate_b = ggml_view_1d(ctx, b, b->ne[0] / 2, b->nb[0] * b->ne[0] / 2); // [dim_out, ]
+ auto x_w = ggml_view_2d(ctx->ggml_ctx, w, w->ne[0], w->ne[1] / 2, w->nb[1], 0); // [dim_out, dim_in]
+ auto x_b = ggml_view_1d(ctx->ggml_ctx, b, b->ne[0] / 2, 0); // [dim_out, dim_in]
+ auto gate_w = ggml_view_2d(ctx->ggml_ctx, w, w->ne[0], w->ne[1] / 2, w->nb[1], w->nb[1] * w->ne[1] / 2); // [dim_out, ]
+ auto gate_b = ggml_view_1d(ctx->ggml_ctx, b, b->ne[0] / 2, b->nb[0] * b->ne[0] / 2); // [dim_out, ]
auto x_in = x;
- x = ggml_nn_linear(ctx, x_in, x_w, x_b); // [ne3, ne2, ne1, dim_out]
- auto gate = ggml_nn_linear(ctx, x_in, gate_w, gate_b); // [ne3, ne2, ne1, dim_out]
+ x = ggml_ext_linear(ctx->ggml_ctx, x_in, x_w, x_b); // [ne3, ne2, ne1, dim_out]
+ auto gate = ggml_ext_linear(ctx->ggml_ctx, x_in, gate_w, gate_b); // [ne3, ne2, ne1, dim_out]
- gate = ggml_gelu_inplace(ctx, gate);
+ gate = ggml_gelu_inplace(ctx->ggml_ctx, gate);
- x = ggml_mul(ctx, x, gate); // [ne3, ne2, ne1, dim_out]
+ x = ggml_mul(ctx->ggml_ctx, x, gate); // [ne3, ne2, ne1, dim_out]
return x;
}
};
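Note: GEGLU keeps one fused projection of width 2 * dim_out and splits it with views, so the forward pass above is equivalent to

    GEGLU(x) = (x * W_x + b_x) ⊙ GELU(x * W_g + b_g)

where proj.weight = [W_x; W_g], proj.bias = [b_x; b_g], each half taken along the output dimension (dim_out each), and ⊙ is elementwise multiplication.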
+class GELU : public UnaryBlock {
+public:
+ GELU(int64_t dim_in, int64_t dim_out, bool bias = true) {
+ blocks["proj"] = std::shared_ptr(new Linear(dim_in, dim_out, bias));
+ }
+
+ struct ggml_tensor* forward(GGMLRunnerContext* ctx, struct ggml_tensor* x) override {
+ // x: [ne3, ne2, ne1, dim_in]
+ // return: [ne3, ne2, ne1, dim_out]
+ auto proj = std::dynamic_pointer_cast(blocks["proj"]);
+
+ x = proj->forward(ctx, x);
+ x = ggml_gelu_inplace(ctx->ggml_ctx, x);
+ return x;
+ }
+};
+
class FeedForward : public GGMLBlock {
public:
+ enum class Activation {
+ GEGLU,
+ GELU
+ };
FeedForward(int64_t dim,
int64_t dim_out,
- int64_t mult = 4) {
+ int64_t mult = 4,
+ Activation activation = Activation::GEGLU,
+ bool precision_fix = false) {
int64_t inner_dim = dim * mult;
+ if (activation == Activation::GELU) {
+ blocks["net.0"] = std::shared_ptr(new GELU(dim, inner_dim));
+ } else {
+ blocks["net.0"] = std::shared_ptr(new GEGLU(dim, inner_dim));
+ }
- blocks["net.0"] = std::shared_ptr(new GEGLU(dim, inner_dim));
// net_1 is nn.Dropout(), skip for inference
- blocks["net.2"] = std::shared_ptr(new Linear(inner_dim, dim_out));
+ float scale = 1.f;
+ if (precision_fix) {
+ scale = 1.f / 128.f;
+ }
+ // The purpose of the scale here is to prevent NaN issues in certain situations.
+ // For example, when using Vulkan without enabling force_prec_f32,
+ // or when using CUDA with k-quant weights.
+ blocks["net.2"] = std::shared_ptr(new Linear(inner_dim, dim_out, true, false, false, scale));
}
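Note: a rough numeric illustration of why a small output scale helps here, assuming some backend accumulates this projection in fp16: fp16 overflows above ~65504, so an activation spike in the 1e5 range becomes inf and then NaN, while the same value divided by 128 stays well inside range.

    // Illustration only, not project code.
    float act    = 1.0e5f;       // would overflow fp16 (max finite value ~65504) -> inf -> NaN downstream
    float scaled = act / 128.f;  // ~781.25, safely representable; a compensating rescale elsewhere (assumed) restores the magnitude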
- struct ggml_tensor* forward(struct ggml_context* ctx, struct ggml_tensor* x) {
+ struct ggml_tensor* forward(GGMLRunnerContext* ctx, struct ggml_tensor* x) {
// x: [ne3, ne2, ne1, dim]
// return: [ne3, ne2, ne1, dim_out]
- auto net_0 = std::dynamic_pointer_cast(blocks["net.0"]);
+ auto net_0 = std::dynamic_pointer_cast(blocks["net.0"]);
auto net_2 = std::dynamic_pointer_cast(blocks["net.2"]);
x = net_0->forward(ctx, x); // [ne3, ne2, ne1, inner_dim]
@@ -247,19 +281,16 @@ class CrossAttention : public GGMLBlock {
int64_t context_dim;
int64_t n_head;
int64_t d_head;
- bool flash_attn;
public:
CrossAttention(int64_t query_dim,
int64_t context_dim,
int64_t n_head,
- int64_t d_head,
- bool flash_attn = false)
+ int64_t d_head)
: n_head(n_head),
d_head(d_head),
query_dim(query_dim),
- context_dim(context_dim),
- flash_attn(flash_attn) {
+ context_dim(context_dim) {
int64_t inner_dim = d_head * n_head;
blocks["to_q"] = std::shared_ptr(new Linear(query_dim, inner_dim, false));
@@ -270,8 +301,7 @@ class CrossAttention : public GGMLBlock {
// to_out_1 is nn.Dropout(), skip for inference
}
- struct ggml_tensor* forward(struct ggml_context* ctx,
- ggml_backend_t backend,
+ struct ggml_tensor* forward(GGMLRunnerContext* ctx,
struct ggml_tensor* x,
struct ggml_tensor* context) {
// x: [N, n_token, query_dim]
@@ -291,7 +321,7 @@ class CrossAttention : public GGMLBlock {
auto k = to_k->forward(ctx, context); // [N, n_context, inner_dim]
auto v = to_v->forward(ctx, context); // [N, n_context, inner_dim]
- x = ggml_nn_attention_ext(ctx, backend, q, k, v, n_head, NULL, false, false, flash_attn); // [N, n_token, inner_dim]
+ x = ggml_ext_attention_ext(ctx->ggml_ctx, ctx->backend, q, k, v, n_head, nullptr, false, false, ctx->flash_attn_enabled); // [N, n_token, inner_dim]
x = to_out_0->forward(ctx, x); // [N, n_token, query_dim]
return x;
@@ -309,16 +339,15 @@ class BasicTransformerBlock : public GGMLBlock {
int64_t n_head,
int64_t d_head,
int64_t context_dim,
- bool ff_in = false,
- bool flash_attn = false)
+ bool ff_in = false)
: n_head(n_head), d_head(d_head), ff_in(ff_in) {
// disable_self_attn is always False
// disable_temporal_crossattention is always False
// switch_temporal_ca_to_sa is always False
// inner_dim is always None or equal to dim
// gated_ff is always True
- blocks["attn1"] = std::shared_ptr(new CrossAttention(dim, dim, n_head, d_head, flash_attn));
- blocks["attn2"] = std::shared_ptr(new CrossAttention(dim, context_dim, n_head, d_head, flash_attn));
+ blocks["attn1"] = std::shared_ptr(new CrossAttention(dim, dim, n_head, d_head));
+ blocks["attn2"] = std::shared_ptr(new CrossAttention(dim, context_dim, n_head, d_head));
blocks["ff"] = std::shared_ptr(new FeedForward(dim, dim));
blocks["norm1"] = std::shared_ptr(new LayerNorm(dim));
blocks["norm2"] = std::shared_ptr(new LayerNorm(dim));
@@ -330,8 +359,7 @@ class BasicTransformerBlock : public GGMLBlock {
}
}
- struct ggml_tensor* forward(struct ggml_context* ctx,
- ggml_backend_t backend,
+ struct ggml_tensor* forward(GGMLRunnerContext* ctx,
struct ggml_tensor* x,
struct ggml_tensor* context) {
// x: [N, n_token, query_dim]
@@ -353,21 +381,21 @@ class BasicTransformerBlock : public GGMLBlock {
x = norm_in->forward(ctx, x);
x = ff_in->forward(ctx, x);
// self.is_res is always True
- x = ggml_add(ctx, x, x_skip);
+ x = ggml_add(ctx->ggml_ctx, x, x_skip);
}
auto r = x;
x = norm1->forward(ctx, x);
- x = attn1->forward(ctx, backend, x, x); // self-attention
- x = ggml_add(ctx, x, r);
+ x = attn1->forward(ctx, x, x); // self-attention
+ x = ggml_add(ctx->ggml_ctx, x, r);
r = x;
x = norm2->forward(ctx, x);
- x = attn2->forward(ctx, backend, x, context); // cross-attention
- x = ggml_add(ctx, x, r);
+ x = attn2->forward(ctx, x, context); // cross-attention
+ x = ggml_add(ctx->ggml_ctx, x, r);
r = x;
x = norm3->forward(ctx, x);
x = ff->forward(ctx, x);
- x = ggml_add(ctx, x, r);
+ x = ggml_add(ctx->ggml_ctx, x, r);
return x;
}
@@ -380,6 +408,23 @@ class SpatialTransformer : public GGMLBlock {
int64_t d_head;
int64_t depth = 1; // 1
int64_t context_dim = 768; // hidden_size, 1024 for VERSION_SD2
+ bool use_linear = false;
+
+ void init_params(struct ggml_context* ctx, const String2TensorStorage& tensor_storage_map = {}, const std::string prefix = "") {
+ auto iter = tensor_storage_map.find(prefix + "proj_out.weight");
+ if (iter != tensor_storage_map.end()) {
+ int64_t inner_dim = n_head * d_head;
+ if (iter->second.n_dims == 4 && use_linear) {
+ use_linear = false;
+ blocks["proj_in"] = std::make_shared(in_channels, inner_dim, std::pair{1, 1});
+ blocks["proj_out"] = std::make_shared(inner_dim, in_channels, std::pair{1, 1});
+ } else if (iter->second.n_dims == 2 && !use_linear) {
+ use_linear = true;
+ blocks["proj_in"] = std::make_shared(in_channels, inner_dim);
+ blocks["proj_out"] = std::make_shared(inner_dim, in_channels);
+ }
+ }
+ }
public:
SpatialTransformer(int64_t in_channels,
@@ -387,35 +432,42 @@ class SpatialTransformer : public GGMLBlock {
int64_t d_head,
int64_t depth,
int64_t context_dim,
- bool flash_attn = false)
+ bool use_linear)
: in_channels(in_channels),
n_head(n_head),
d_head(d_head),
depth(depth),
- context_dim(context_dim) {
- // We will convert unet transformer linear to conv2d 1x1 when loading the weights, so use_linear is always False
+ context_dim(context_dim),
+ use_linear(use_linear) {
// disable_self_attn is always False
int64_t inner_dim = n_head * d_head; // in_channels
blocks["norm"] = std::shared_ptr(new GroupNorm32(in_channels));
- blocks["proj_in"] = std::shared_ptr(new Conv2d(in_channels, inner_dim, {1, 1}));
+ if (use_linear) {
+ blocks["proj_in"] = std::shared_ptr(new Linear(in_channels, inner_dim));
+ } else {
+ blocks["proj_in"] = std::shared_ptr(new Conv2d(in_channels, inner_dim, {1, 1}));
+ }
for (int i = 0; i < depth; i++) {
std::string name = "transformer_blocks." + std::to_string(i);
- blocks[name] = std::shared_ptr(new BasicTransformerBlock(inner_dim, n_head, d_head, context_dim, false, flash_attn));
+ blocks[name] = std::shared_ptr(new BasicTransformerBlock(inner_dim, n_head, d_head, context_dim, false));
}
- blocks["proj_out"] = std::shared_ptr(new Conv2d(inner_dim, in_channels, {1, 1}));
+ if (use_linear) {
+ blocks["proj_out"] = std::shared_ptr(new Linear(inner_dim, in_channels));
+ } else {
+ blocks["proj_out"] = std::shared_ptr(new Conv2d(inner_dim, in_channels, {1, 1}));
+ }
}
- virtual struct ggml_tensor* forward(struct ggml_context* ctx,
- ggml_backend_t backend,
+ virtual struct ggml_tensor* forward(GGMLRunnerContext* ctx,
struct ggml_tensor* x,
struct ggml_tensor* context) {
// x: [N, in_channels, h, w]
// context: [N, max_position(aka n_token), hidden_size(aka context_dim)]
auto norm = std::dynamic_pointer_cast(blocks["norm"]);
- auto proj_in = std::dynamic_pointer_cast(blocks["proj_in"]);
- auto proj_out = std::dynamic_pointer_cast(blocks["proj_out"]);
+ auto proj_in = std::dynamic_pointer_cast(blocks["proj_in"]);
+ auto proj_out = std::dynamic_pointer_cast(blocks["proj_out"]);
auto x_in = x;
int64_t n = x->ne[3];
@@ -424,32 +476,45 @@ class SpatialTransformer : public GGMLBlock {
int64_t inner_dim = n_head * d_head;
x = norm->forward(ctx, x);
- x = proj_in->forward(ctx, x); // [N, inner_dim, h, w]
-
- x = ggml_cont(ctx, ggml_permute(ctx, x, 1, 2, 0, 3)); // [N, h, w, inner_dim]
- x = ggml_reshape_3d(ctx, x, inner_dim, w * h, n); // [N, h * w, inner_dim]
+ if (use_linear) {
+ x = ggml_cont(ctx->ggml_ctx, ggml_permute(ctx->ggml_ctx, x, 1, 2, 0, 3)); // [N, h, w, inner_dim]
+ x = ggml_reshape_3d(ctx->ggml_ctx, x, inner_dim, w * h, n); // [N, h * w, inner_dim]
+ x = proj_in->forward(ctx, x); // [N, h * w, inner_dim]
+ } else {
+ x = proj_in->forward(ctx, x); // [N, inner_dim, h, w]
+ x = ggml_cont(ctx->ggml_ctx, ggml_permute(ctx->ggml_ctx, x, 1, 2, 0, 3)); // [N, h, w, inner_dim]
+ x = ggml_reshape_3d(ctx->ggml_ctx, x, inner_dim, w * h, n); // [N, h * w, inner_dim]
+ }
for (int i = 0; i < depth; i++) {
std::string name = "transformer_blocks." + std::to_string(i);
auto transformer_block = std::dynamic_pointer_cast(blocks[name]);
- x = transformer_block->forward(ctx, backend, x, context);
+ x = transformer_block->forward(ctx, x, context);
}
- x = ggml_cont(ctx, ggml_permute(ctx, x, 1, 0, 2, 3)); // [N, inner_dim, h * w]
- x = ggml_reshape_4d(ctx, x, w, h, inner_dim, n); // [N, inner_dim, h, w]
+ if (use_linear) {
+ // proj_out
+ x = proj_out->forward(ctx, x); // [N, h * w, in_channels]
- // proj_out
- x = proj_out->forward(ctx, x); // [N, in_channels, h, w]
+ x = ggml_cont(ctx->ggml_ctx, ggml_permute(ctx->ggml_ctx, x, 1, 0, 2, 3)); // [N, inner_dim, h * w]
+ x = ggml_reshape_4d(ctx->ggml_ctx, x, w, h, inner_dim, n); // [N, inner_dim, h, w]
+ } else {
+ x = ggml_cont(ctx->ggml_ctx, ggml_permute(ctx->ggml_ctx, x, 1, 0, 2, 3)); // [N, inner_dim, h * w]
+ x = ggml_reshape_4d(ctx->ggml_ctx, x, w, h, inner_dim, n); // [N, inner_dim, h, w]
+
+ // proj_out
+ x = proj_out->forward(ctx, x); // [N, in_channels, h, w]
+ }
- x = ggml_add(ctx, x, x_in);
+ x = ggml_add(ctx->ggml_ctx, x, x_in);
return x;
}
};
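Note: the two projection paths only differ in where the layout change happens; a rough shape-flow sketch in logical [N, ...] shapes (inner_dim == in_channels here):

    // conv path   : [N, C, h, w] --proj_in (1x1 Conv2d)--> [N, inner_dim, h, w] --permute+reshape--> [N, h*w, inner_dim]
    // linear path : [N, C, h, w] --permute+reshape--> [N, h*w, C] --proj_in (Linear)--> [N, h*w, inner_dim]
    // transformer_blocks run on [N, h*w, inner_dim] in both cases
    // conv path   : permute+reshape back to [N, inner_dim, h, w], then proj_out (1x1 Conv2d) -> [N, in_channels, h, w]
    // linear path : proj_out (Linear) -> [N, h*w, in_channels], then permute+reshape back to [N, in_channels, h, w]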
class AlphaBlender : public GGMLBlock {
protected:
- void init_params(struct ggml_context* ctx, const String2GGMLType& tensor_types = {}, std::string prefix = "") {
+ void init_params(struct ggml_context* ctx, const String2TensorStorage& tensor_storage_map = {}, std::string prefix = "") override {
// Get the type of the "mix_factor" tensor from the input tensors map with the specified prefix
enum ggml_type wtype = GGML_TYPE_F32;
params["mix_factor"] = ggml_new_tensor_1d(ctx, wtype, 1);
@@ -458,7 +523,7 @@ class AlphaBlender : public GGMLBlock {
float get_alpha() {
// image_only_indicator is always tensor([0.]) and since mix_factor.shape is [1,]
// so learned_with_images is same as learned
- float alpha = ggml_backend_tensor_get_f32(params["mix_factor"]);
+ float alpha = ggml_ext_backend_tensor_get_f32(params["mix_factor"]);
return sigmoid(alpha);
}
@@ -469,14 +534,14 @@ class AlphaBlender : public GGMLBlock {
// since mix_factor.shape is [1,], we don't need rearrange using rearrange_pattern
}
- struct ggml_tensor* forward(struct ggml_context* ctx,
+ struct ggml_tensor* forward(GGMLRunnerContext* ctx,
struct ggml_tensor* x_spatial,
struct ggml_tensor* x_temporal) {
// image_only_indicator is always tensor([0.])
float alpha = get_alpha();
- auto x = ggml_add(ctx,
- ggml_scale(ctx, x_spatial, alpha),
- ggml_scale(ctx, x_temporal, 1.0f - alpha));
+ auto x = ggml_add(ctx->ggml_ctx,
+ ggml_scale(ctx->ggml_ctx, x_spatial, alpha),
+ ggml_scale(ctx->ggml_ctx, x_temporal, 1.0f - alpha));
return x;
}
};
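Note: with image_only_indicator fixed at 0 and a scalar mix_factor, the blend reduces to one learned sigmoid gate:

    alpha = sigmoid(mix_factor)
    x     = alpha * x_spatial + (1 - alpha) * x_temporal

For example, an untrained mix_factor of 0 gives alpha = 0.5, an even spatial/temporal mix.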
@@ -494,7 +559,7 @@ class VideoResBlock : public ResBlock {
blocks["time_mixer"] = std::shared_ptr(new AlphaBlender());
}
- struct ggml_tensor* forward(struct ggml_context* ctx,
+ struct ggml_tensor* forward(GGMLRunnerContext* ctx,
struct ggml_tensor* x,
struct ggml_tensor* emb,
int num_video_frames) {
@@ -512,18 +577,18 @@ class VideoResBlock : public ResBlock {
int64_t H = x->ne[1];
int64_t W = x->ne[0];
- x = ggml_reshape_4d(ctx, x, W * H, C, T, B); // (b t) c h w -> b t c (h w)
- x = ggml_cont(ctx, ggml_permute(ctx, x, 0, 2, 1, 3)); // b t c (h w) -> b c t (h w)
+ x = ggml_reshape_4d(ctx->ggml_ctx, x, W * H, C, T, B); // (b t) c h w -> b t c (h w)
+ x = ggml_cont(ctx->ggml_ctx, ggml_permute(ctx->ggml_ctx, x, 0, 2, 1, 3)); // b t c (h w) -> b c t (h w)
auto x_mix = x;
- emb = ggml_reshape_4d(ctx, emb, emb->ne[0], T, B, emb->ne[3]); // (b t) ... -> b t ...
+ emb = ggml_reshape_4d(ctx->ggml_ctx, emb, emb->ne[0], T, B, emb->ne[3]); // (b t) ... -> b t ...
x = time_stack->forward(ctx, x, emb); // b t c (h w)
x = time_mixer->forward(ctx, x_mix, x); // b t c (h w)
- x = ggml_cont(ctx, ggml_permute(ctx, x, 0, 2, 1, 3)); // b c t (h w) -> b t c (h w)
- x = ggml_reshape_4d(ctx, x, W, H, C, T * B); // b t c (h w) -> (b t) c h w
+ x = ggml_cont(ctx->ggml_ctx, ggml_permute(ctx->ggml_ctx, x, 0, 2, 1, 3)); // b c t (h w) -> b t c (h w)
+ x = ggml_reshape_4d(ctx->ggml_ctx, x, W, H, C, T * B); // b t c (h w) -> (b t) c h w
return x;
}
diff --git a/conditioner.hpp b/conditioner.hpp
index df7ed0cc8..93e0c2818 100644
--- a/conditioner.hpp
+++ b/conditioner.hpp
@@ -2,42 +2,47 @@
#define __CONDITIONER_HPP__
#include "clip.hpp"
+#include "qwenvl.hpp"
#include "t5.hpp"
struct SDCondition {
- struct ggml_tensor* c_crossattn = NULL; // aka context
- struct ggml_tensor* c_vector = NULL; // aka y
- struct ggml_tensor* c_concat = NULL;
+ struct ggml_tensor* c_crossattn = nullptr; // aka context
+ struct ggml_tensor* c_vector = nullptr; // aka y
+ struct ggml_tensor* c_concat = nullptr;
SDCondition() = default;
SDCondition(struct ggml_tensor* c_crossattn, struct ggml_tensor* c_vector, struct ggml_tensor* c_concat)
: c_crossattn(c_crossattn), c_vector(c_vector), c_concat(c_concat) {}
};
+struct ConditionerParams {
+ std::string text;
+ int clip_skip = -1;
+ int width = -1;
+ int height = -1;
+ int adm_in_channels = -1;
+ bool zero_out_masked = false;
+ int num_input_imgs = 0; // for photomaker
+ std::vector ref_images = {}; // for qwen image edit
+};
+
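Note: a hypothetical call-site sketch of the new parameter struct replacing the old positional argument list; work_ctx, n_threads and conditioner are assumed to exist, and the adm_in_channels value is illustrative:

    ConditionerParams params;
    params.text            = "a photo of a cat";
    params.clip_skip       = -1;    // -1: let the conditioner pick the model default
    params.width           = 1024;
    params.height          = 1024;
    params.adm_in_channels = 2816;  // illustrative SDXL-style value
    params.zero_out_masked = false;
    SDCondition cond = conditioner->get_learned_condition(work_ctx, n_threads, params);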
struct Conditioner {
virtual SDCondition get_learned_condition(ggml_context* work_ctx,
int n_threads,
- const std::string& text,
- int clip_skip,
- int width,
- int height,
- int adm_in_channels = -1,
- bool zero_out_masked = false) = 0;
- virtual void alloc_params_buffer() = 0;
- virtual void free_params_buffer() = 0;
- virtual void get_param_tensors(std::map& tensors) = 0;
- virtual size_t get_params_buffer_size() = 0;
+ const ConditionerParams& conditioner_params) = 0;
+ virtual void alloc_params_buffer() = 0;
+ virtual void free_params_buffer() = 0;
+ virtual void get_param_tensors(std::map& tensors) = 0;
+ virtual size_t get_params_buffer_size() = 0;
virtual std::tuple> get_learned_condition_with_trigger(ggml_context* work_ctx,
int n_threads,
- const std::string& text,
- int clip_skip,
- int width,
- int height,
- int num_input_imgs,
- int adm_in_channels = -1,
- bool zero_out_masked = false) = 0;
+ const ConditionerParams& conditioner_params) {
+ GGML_ABORT("Not implemented yet!");
+ }
virtual std::string remove_trigger_from_prompt(ggml_context* work_ctx,
- const std::string& prompt) = 0;
+ const std::string& prompt) {
+ GGML_ABORT("Not implemented yet!");
+ }
};
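Note: the trigger-token entry points now have aborting default bodies, so a conditioner that does not support them only overrides what it needs. A minimal sketch, with member signatures assumed from this header:

    struct MinimalConditioner : public Conditioner {
        SDCondition get_learned_condition(ggml_context* work_ctx,
                                          int n_threads,
                                          const ConditionerParams& params) override {
            return {};  // build c_crossattn / c_vector / c_concat here
        }
        void alloc_params_buffer() override {}
        void free_params_buffer() override {}
        void get_param_tensors(std::map<std::string, struct ggml_tensor*>& tensors) override {}
        size_t get_params_buffer_size() override { return 0; }
    };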
// ldm.modules.encoders.modules.FrozenCLIPEmbedder
@@ -58,44 +63,44 @@ struct FrozenCLIPEmbedderWithCustomWords : public Conditioner {
FrozenCLIPEmbedderWithCustomWords(ggml_backend_t backend,
bool offload_params_to_cpu,
- const String2GGMLType& tensor_types,
+ const String2TensorStorage& tensor_storage_map,
const std::string& embd_dir,
SDVersion version = VERSION_SD1,
PMVersion pv = PM_VERSION_1)
: version(version), pm_version(pv), tokenizer(sd_version_is_sd2(version) ? 0 : 49407), embd_dir(embd_dir) {
bool force_clip_f32 = embd_dir.size() > 0;
if (sd_version_is_sd1(version)) {
- text_model = std::make_shared(backend, offload_params_to_cpu, tensor_types, "cond_stage_model.transformer.text_model", OPENAI_CLIP_VIT_L_14, true, force_clip_f32);
+ text_model = std::make_shared(backend, offload_params_to_cpu, tensor_storage_map, "cond_stage_model.transformer.text_model", OPENAI_CLIP_VIT_L_14, true, force_clip_f32);
} else if (sd_version_is_sd2(version)) {
- text_model = std::make_shared(backend, offload_params_to_cpu, tensor_types, "cond_stage_model.transformer.text_model", OPEN_CLIP_VIT_H_14, true, force_clip_f32);
+ text_model = std::make_shared(backend, offload_params_to_cpu, tensor_storage_map, "cond_stage_model.transformer.text_model", OPEN_CLIP_VIT_H_14, true, force_clip_f32);
} else if (sd_version_is_sdxl(version)) {
- text_model = std::make_shared(backend, offload_params_to_cpu, tensor_types, "cond_stage_model.transformer.text_model", OPENAI_CLIP_VIT_L_14, false, force_clip_f32);
- text_model2 = std::make_shared(backend, offload_params_to_cpu, tensor_types, "cond_stage_model.1.transformer.text_model", OPEN_CLIP_VIT_BIGG_14, false, force_clip_f32);
+ text_model = std::make_shared(backend, offload_params_to_cpu, tensor_storage_map, "cond_stage_model.transformer.text_model", OPENAI_CLIP_VIT_L_14, false, force_clip_f32);
+ text_model2 = std::make_shared(backend, offload_params_to_cpu, tensor_storage_map, "cond_stage_model.1.transformer.text_model", OPEN_CLIP_VIT_BIGG_14, false, force_clip_f32);
}
}
- void get_param_tensors(std::map& tensors) {
+ void get_param_tensors(std::map& tensors) override {
text_model->get_param_tensors(tensors, "cond_stage_model.transformer.text_model");
if (sd_version_is_sdxl(version)) {
text_model2->get_param_tensors(tensors, "cond_stage_model.1.transformer.text_model");
}
}
- void alloc_params_buffer() {
+ void alloc_params_buffer() override {
text_model->alloc_params_buffer();
if (sd_version_is_sdxl(version)) {
text_model2->alloc_params_buffer();
}
}
- void free_params_buffer() {
+ void free_params_buffer() override {
text_model->free_params_buffer();
if (sd_version_is_sdxl(version)) {
text_model2->free_params_buffer();
}
}
- size_t get_params_buffer_size() {
+ size_t get_params_buffer_size() override {
size_t buffer_size = text_model->get_params_buffer_size();
if (sd_version_is_sdxl(version)) {
buffer_size += text_model2->get_params_buffer_size();
@@ -106,7 +111,7 @@ struct FrozenCLIPEmbedderWithCustomWords : public Conditioner {
bool load_embedding(std::string embd_name, std::string embd_path, std::vector& bpe_tokens) {
// the order matters
ModelLoader model_loader;
- if (!model_loader.init_from_file(embd_path)) {
+ if (!model_loader.init_from_file_and_convert_name(embd_path)) {
LOG_ERROR("embedding '%s' failed", embd_name.c_str());
return false;
}
@@ -116,11 +121,11 @@ struct FrozenCLIPEmbedderWithCustomWords : public Conditioner {
}
struct ggml_init_params params;
params.mem_size = 100 * 1024 * 1024; // max for custom embeddings 100 MB
- params.mem_buffer = NULL;
+ params.mem_buffer = nullptr;
params.no_alloc = false;
struct ggml_context* embd_ctx = ggml_init(params);
- struct ggml_tensor* embd = NULL;
- struct ggml_tensor* embd2 = NULL;
+ struct ggml_tensor* embd = nullptr;
+ struct ggml_tensor* embd2 = nullptr;
auto on_load = [&](const TensorStorage& tensor_storage, ggml_tensor** dst_tensor) {
if (tensor_storage.ne[0] != text_model->model.hidden_size) {
if (text_model2) {
@@ -399,11 +404,11 @@ struct FrozenCLIPEmbedderWithCustomWords : public Conditioner {
int adm_in_channels = -1,
bool zero_out_masked = false) {
int64_t t0 = ggml_time_ms();
- struct ggml_tensor* hidden_states = NULL; // [N, n_token, hidden_size]
- struct ggml_tensor* chunk_hidden_states = NULL; // [n_token, hidden_size] or [n_token, hidden_size + hidden_size2]
- struct ggml_tensor* chunk_hidden_states1 = NULL; // [n_token, hidden_size]
- struct ggml_tensor* chunk_hidden_states2 = NULL; // [n_token, hidden_size2]
- struct ggml_tensor* pooled = NULL;
+ struct ggml_tensor* hidden_states = nullptr; // [N, n_token, hidden_size]
+ struct ggml_tensor* chunk_hidden_states = nullptr; // [n_token, hidden_size] or [n_token, hidden_size + hidden_size2]
+ struct ggml_tensor* chunk_hidden_states1 = nullptr; // [n_token, hidden_size]
+ struct ggml_tensor* chunk_hidden_states2 = nullptr; // [n_token, hidden_size2]
+ struct ggml_tensor* pooled = nullptr;
std::vector hidden_states_vec;
if (clip_skip <= 0) {
@@ -419,7 +424,7 @@ struct FrozenCLIPEmbedderWithCustomWords : public Conditioner {
weights.begin() + (chunk_idx + 1) * chunk_len);
auto input_ids = vector_to_ggml_tensor_i32(work_ctx, chunk_tokens);
- struct ggml_tensor* input_ids2 = NULL;
+ struct ggml_tensor* input_ids2 = nullptr;
size_t max_token_idx = 0;
if (sd_version_is_sdxl(version)) {
auto it = std::find(chunk_tokens.begin(), chunk_tokens.end(), tokenizer.EOS_TOKEN_ID);
@@ -457,7 +462,7 @@ struct FrozenCLIPEmbedderWithCustomWords : public Conditioner {
clip_skip,
&chunk_hidden_states2, work_ctx);
// concat
- chunk_hidden_states = ggml_tensor_concat(work_ctx, chunk_hidden_states1, chunk_hidden_states2, 0);
+ chunk_hidden_states = ggml_ext_tensor_concat(work_ctx, chunk_hidden_states1, chunk_hidden_states2, 0);
if (chunk_idx == 0) {
text_model2->compute(n_threads,
@@ -479,18 +484,18 @@ struct FrozenCLIPEmbedderWithCustomWords : public Conditioner {
LOG_DEBUG("computing condition graph completed, taking %" PRId64 " ms", t1 - t0);
ggml_tensor* result = ggml_dup_tensor(work_ctx, chunk_hidden_states);
{
- float original_mean = ggml_tensor_mean(chunk_hidden_states);
+ float original_mean = ggml_ext_tensor_mean(chunk_hidden_states);
for (int i2 = 0; i2 < chunk_hidden_states->ne[2]; i2++) {
for (int i1 = 0; i1 < chunk_hidden_states->ne[1]; i1++) {
for (int i0 = 0; i0 < chunk_hidden_states->ne[0]; i0++) {
- float value = ggml_tensor_get_f32(chunk_hidden_states, i0, i1, i2);
+ float value = ggml_ext_tensor_get_f32(chunk_hidden_states, i0, i1, i2);
value *= chunk_weights[i1];
- ggml_tensor_set_f32(result, value, i0, i1, i2);
+ ggml_ext_tensor_set_f32(result, value, i0, i1, i2);
}
}
}
- float new_mean = ggml_tensor_mean(result);
- ggml_tensor_scale(result, (original_mean / new_mean));
+ float new_mean = ggml_ext_tensor_mean(result);
+ ggml_ext_tensor_scale_inplace(result, (original_mean / new_mean));
}
if (zero_out_masked) {
float* vec = (float*)result->data;
@@ -507,7 +512,7 @@ struct FrozenCLIPEmbedderWithCustomWords : public Conditioner {
chunk_hidden_states->ne[0],
ggml_nelements(hidden_states) / chunk_hidden_states->ne[0]);
- ggml_tensor* vec = NULL;
+ ggml_tensor* vec = nullptr;
if (sd_version_is_sdxl(version)) {
int out_dim = 256;
vec = ggml_new_tensor_1d(work_ctx, GGML_TYPE_F32, adm_in_channels);
@@ -544,26 +549,20 @@ struct FrozenCLIPEmbedderWithCustomWords : public Conditioner {
GGML_ASSERT(offset == ggml_nbytes(vec));
}
// print_ggml_tensor(result);
- return SDCondition(hidden_states, vec, NULL);
+ return {hidden_states, vec, nullptr};
}
std::tuple>
get_learned_condition_with_trigger(ggml_context* work_ctx,
int n_threads,
- const std::string& text,
- int clip_skip,
- int width,
- int height,
- int num_input_imgs,
- int adm_in_channels = -1,
- bool zero_out_masked = false) {
+ const ConditionerParams& conditioner_params) override {
auto image_tokens = convert_token_to_id(trigger_word);
// if(image_tokens.size() == 1){
// printf(" image token id is: %d \n", image_tokens[0]);
// }
GGML_ASSERT(image_tokens.size() == 1);
- auto tokens_and_weights = tokenize_with_trigger_token(text,
- num_input_imgs,
+ auto tokens_and_weights = tokenize_with_trigger_token(conditioner_params.text,
+ conditioner_params.num_input_imgs,
image_tokens[0],
true);
std::vector& tokens = std::get<0>(tokens_and_weights);
@@ -577,12 +576,20 @@ struct FrozenCLIPEmbedderWithCustomWords : public Conditioner {
// for(int i = 0; i < clsm.size(); ++i)
// printf("%d ", clsm[i]?1:0);
// printf("\n");
- auto cond = get_learned_condition_common(work_ctx, n_threads, tokens, weights, clip_skip, width, height, adm_in_channels, zero_out_masked);
+ auto cond = get_learned_condition_common(work_ctx,
+ n_threads,
+ tokens,
+ weights,
+ conditioner_params.clip_skip,
+ conditioner_params.width,
+ conditioner_params.height,
+ conditioner_params.adm_in_channels,
+ conditioner_params.zero_out_masked);
return std::make_tuple(cond, clsm);
}
std::string remove_trigger_from_prompt(ggml_context* work_ctx,
- const std::string& prompt) {
+ const std::string& prompt) override {
auto image_tokens = convert_token_to_id(trigger_word);
GGML_ASSERT(image_tokens.size() == 1);
auto tokens_and_weights = tokenize(prompt, false);
@@ -595,16 +602,19 @@ struct FrozenCLIPEmbedderWithCustomWords : public Conditioner {
SDCondition get_learned_condition(ggml_context* work_ctx,
int n_threads,
- const std::string& text,
- int clip_skip,
- int width,
- int height,
- int adm_in_channels = -1,
- bool zero_out_masked = false) {
- auto tokens_and_weights = tokenize(text, true);
+ const ConditionerParams& conditioner_params) override {
+ auto tokens_and_weights = tokenize(conditioner_params.text, true);
std::vector& tokens = tokens_and_weights.first;
std::vector& weights = tokens_and_weights.second;
- return get_learned_condition_common(work_ctx, n_threads, tokens, weights, clip_skip, width, height, adm_in_channels, zero_out_masked);
+ return get_learned_condition_common(work_ctx,
+ n_threads,
+ tokens,
+ weights,
+ conditioner_params.clip_skip,
+ conditioner_params.width,
+ conditioner_params.height,
+ conditioner_params.adm_in_channels,
+ conditioner_params.zero_out_masked);
}
};
@@ -613,12 +623,24 @@ struct FrozenCLIPVisionEmbedder : public GGMLRunner {
FrozenCLIPVisionEmbedder(ggml_backend_t backend,
bool offload_params_to_cpu,
- const String2GGMLType& tensor_types = {})
- : vision_model(OPEN_CLIP_VIT_H_14), GGMLRunner(backend, offload_params_to_cpu) {
- vision_model.init(params_ctx, tensor_types, "cond_stage_model.transformer");
+ const String2TensorStorage& tensor_storage_map = {})
+ : GGMLRunner(backend, offload_params_to_cpu) {
+ std::string prefix = "cond_stage_model.transformer";
+ bool proj_in = false;
+ for (const auto& [name, tensor_storage] : tensor_storage_map) {
+ if (!starts_with(name, prefix)) {
+ continue;
+ }
+ if (contains(name, "self_attn.in_proj")) {
+ proj_in = true;
+ break;
+ }
+ }
+ vision_model = CLIPVisionModelProjection(OPEN_CLIP_VIT_H_14, false, proj_in);
+ vision_model.init(params_ctx, tensor_storage_map, prefix);
}
- std::string get_desc() {
+ std::string get_desc() override {
return "clip_vision";
}
@@ -631,7 +653,9 @@ struct FrozenCLIPVisionEmbedder : public GGMLRunner {
pixel_values = to_backend(pixel_values);
- struct ggml_tensor* hidden_states = vision_model.forward(compute_ctx, runtime_backend, pixel_values, return_pooled, clip_skip);
+ auto runner_ctx = get_context();
+
+ struct ggml_tensor* hidden_states = vision_model.forward(&runner_ctx, pixel_values, return_pooled, clip_skip);
ggml_build_forward_expand(gf, hidden_states);
@@ -661,35 +685,82 @@ struct SD3CLIPEmbedder : public Conditioner {
SD3CLIPEmbedder(ggml_backend_t backend,
bool offload_params_to_cpu,
- const String2GGMLType& tensor_types = {})
+ const String2TensorStorage& tensor_storage_map = {})
: clip_g_tokenizer(0) {
- clip_l = std::make_shared(backend, offload_params_to_cpu, tensor_types, "text_encoders.clip_l.transformer.text_model", OPENAI_CLIP_VIT_L_14, false);
- clip_g = std::make_shared(backend, offload_params_to_cpu, tensor_types, "text_encoders.clip_g.transformer.text_model", OPEN_CLIP_VIT_BIGG_14, false);
- t5 = std::make_shared(backend, offload_params_to_cpu, tensor_types, "text_encoders.t5xxl.transformer");
+ bool use_clip_l = false;
+ bool use_clip_g = false;
+ bool use_t5 = false;
+ for (auto pair : tensor_storage_map) {
+ if (pair.first.find("text_encoders.clip_l") != std::string::npos) {
+ use_clip_l = true;
+ } else if (pair.first.find("text_encoders.clip_g") != std::string::npos) {
+ use_clip_g = true;
+ } else if (pair.first.find("text_encoders.t5xxl") != std::string::npos) {
+ use_t5 = true;
+ }
+ }
+ if (!use_clip_l && !use_clip_g && !use_t5) {
+ LOG_WARN("IMPORTANT NOTICE: No text encoders provided, cannot process prompts!");
+ return;
+ }
+ if (use_clip_l) {
+ clip_l = std::make_shared(backend, offload_params_to_cpu, tensor_storage_map, "text_encoders.clip_l.transformer.text_model", OPENAI_CLIP_VIT_L_14, false);
+ }
+ if (use_clip_g) {
+ clip_g = std::make_shared(backend, offload_params_to_cpu, tensor_storage_map, "text_encoders.clip_g.transformer.text_model", OPEN_CLIP_VIT_BIGG_14, false);
+ }
+ if (use_t5) {
+ t5 = std::make_shared(backend, offload_params_to_cpu, tensor_storage_map, "text_encoders.t5xxl.transformer");
+ }
}
- void get_param_tensors(std::map& tensors) {
- clip_l->get_param_tensors(tensors, "text_encoders.clip_l.transformer.text_model");
- clip_g->get_param_tensors(tensors, "text_encoders.clip_g.transformer.text_model");
- t5->get_param_tensors(tensors, "text_encoders.t5xxl.transformer");
+ void get_param_tensors(std::map& tensors) override {
+ if (clip_l) {
+ clip_l->get_param_tensors(tensors, "text_encoders.clip_l.transformer.text_model");
+ }
+ if (clip_g) {
+ clip_g->get_param_tensors(tensors, "text_encoders.clip_g.transformer.text_model");
+ }
+ if (t5) {
+ t5->get_param_tensors(tensors, "text_encoders.t5xxl.transformer");
+ }
}
- void alloc_params_buffer() {
- clip_l->alloc_params_buffer();
- clip_g->alloc_params_buffer();
- t5->alloc_params_buffer();
+ void alloc_params_buffer() override {
+ if (clip_l) {
+ clip_l->alloc_params_buffer();
+ }
+ if (clip_g) {
+ clip_g->alloc_params_buffer();
+ }
+ if (t5) {
+ t5->alloc_params_buffer();
+ }
}
- void free_params_buffer() {
- clip_l->free_params_buffer();
- clip_g->free_params_buffer();
- t5->free_params_buffer();
+ void free_params_buffer() override {
+ if (clip_l) {
+ clip_l->free_params_buffer();
+ }
+ if (clip_g) {
+ clip_g->free_params_buffer();
+ }
+ if (t5) {
+ t5->free_params_buffer();
+ }
}
- size_t get_params_buffer_size() {
- size_t buffer_size = clip_l->get_params_buffer_size();
- buffer_size += clip_g->get_params_buffer_size();
- buffer_size += t5->get_params_buffer_size();
+ size_t get_params_buffer_size() override {
+ size_t buffer_size = 0;
+ if (clip_l) {
+ buffer_size += clip_l->get_params_buffer_size();
+ }
+ if (clip_g) {
+ buffer_size += clip_g->get_params_buffer_size();
+ }
+ if (t5) {
+ buffer_size += t5->get_params_buffer_size();
+ }
return buffer_size;
}
@@ -721,23 +792,32 @@ struct SD3CLIPEmbedder : public Conditioner {
for (const auto& item : parsed_attention) {
const std::string& curr_text = item.first;
float curr_weight = item.second;
-
- std::vector curr_tokens = clip_l_tokenizer.encode(curr_text, on_new_token_cb);
- clip_l_tokens.insert(clip_l_tokens.end(), curr_tokens.begin(), curr_tokens.end());
- clip_l_weights.insert(clip_l_weights.end(), curr_tokens.size(), curr_weight);
-
- curr_tokens = clip_g_tokenizer.encode(curr_text, on_new_token_cb);
- clip_g_tokens.insert(clip_g_tokens.end(), curr_tokens.begin(), curr_tokens.end());
- clip_g_weights.insert(clip_g_weights.end(), curr_tokens.size(), curr_weight);
-
- curr_tokens = t5_tokenizer.Encode(curr_text, true);
- t5_tokens.insert(t5_tokens.end(), curr_tokens.begin(), curr_tokens.end());
- t5_weights.insert(t5_weights.end(), curr_tokens.size(), curr_weight);
+ if (clip_l) {
+ std::vector curr_tokens = clip_l_tokenizer.encode(curr_text, on_new_token_cb);
+ clip_l_tokens.insert(clip_l_tokens.end(), curr_tokens.begin(), curr_tokens.end());
+ clip_l_weights.insert(clip_l_weights.end(), curr_tokens.size(), curr_weight);
+ }
+ if (clip_g) {
+ std::vector curr_tokens = clip_g_tokenizer.encode(curr_text, on_new_token_cb);
+ clip_g_tokens.insert(clip_g_tokens.end(), curr_tokens.begin(), curr_tokens.end());
+ clip_g_weights.insert(clip_g_weights.end(), curr_tokens.size(), curr_weight);
+ }
+ if (t5) {
+ std::vector curr_tokens = t5_tokenizer.Encode(curr_text, true);
+ t5_tokens.insert(t5_tokens.end(), curr_tokens.begin(), curr_tokens.end());
+ t5_weights.insert(t5_weights.end(), curr_tokens.size(), curr_weight);
+ }
}
- clip_l_tokenizer.pad_tokens(clip_l_tokens, clip_l_weights, max_length, padding);
- clip_g_tokenizer.pad_tokens(clip_g_tokens, clip_g_weights, max_length, padding);
- t5_tokenizer.pad_tokens(t5_tokens, t5_weights, NULL, max_length, padding);
+ if (clip_l) {
+ clip_l_tokenizer.pad_tokens(clip_l_tokens, clip_l_weights, max_length, padding);
+ }
+ if (clip_g) {
+ clip_g_tokenizer.pad_tokens(clip_g_tokens, clip_g_weights, max_length, padding);
+ }
+ if (t5) {
+ t5_tokenizer.pad_tokens(t5_tokens, t5_weights, nullptr, max_length, padding);
+ }
// for (int i = 0; i < clip_l_tokens.size(); i++) {
// std::cout << clip_l_tokens[i] << ":" << clip_l_weights[i] << ", ";
@@ -774,21 +854,21 @@ struct SD3CLIPEmbedder : public Conditioner {
}
int64_t t0 = ggml_time_ms();
- struct ggml_tensor* hidden_states = NULL; // [N, n_token*2, 4096]
- struct ggml_tensor* chunk_hidden_states = NULL; // [n_token*2, 4096]
- struct ggml_tensor* chunk_hidden_states_l = NULL; // [n_token, hidden_size_l]
- struct ggml_tensor* chunk_hidden_states_g = NULL; // [n_token, hidden_size_g]
- struct ggml_tensor* chunk_hidden_states_t5 = NULL; // [n_token, hidden_size_t5]
- struct ggml_tensor* pooled = NULL;
- struct ggml_tensor* pooled_l = NULL; // [768,]
- struct ggml_tensor* pooled_g = NULL; // [1280,]
+ struct ggml_tensor* hidden_states = nullptr; // [N, n_token*2, 4096]
+ struct ggml_tensor* chunk_hidden_states = nullptr; // [n_token*2, 4096]
+ struct ggml_tensor* chunk_hidden_states_l = nullptr; // [n_token, hidden_size_l]
+ struct ggml_tensor* chunk_hidden_states_g = nullptr; // [n_token, hidden_size_g]
+ struct ggml_tensor* chunk_hidden_states_t5 = nullptr; // [n_token, hidden_size_t5]
+ struct ggml_tensor* pooled = nullptr;
+ struct ggml_tensor* pooled_l = nullptr; // [768,]
+ struct ggml_tensor* pooled_g = nullptr; // [1280,]
std::vector hidden_states_vec;
size_t chunk_len = 77;
- size_t chunk_count = clip_l_tokens.size() / chunk_len;
+ size_t chunk_count = std::max(std::max(clip_l_tokens.size(), clip_g_tokens.size()), t5_tokens.size()) / chunk_len;
for (int chunk_idx = 0; chunk_idx < chunk_count; chunk_idx++) {
// clip_l
- {
+ if (clip_l) {
std::vector chunk_tokens(clip_l_tokens.begin() + chunk_idx * chunk_len,
clip_l_tokens.begin() + (chunk_idx + 1) * chunk_len);
std::vector chunk_weights(clip_l_weights.begin() + chunk_idx * chunk_len,
@@ -800,7 +880,7 @@ struct SD3CLIPEmbedder : public Conditioner {
clip_l->compute(n_threads,
input_ids,
0,
- NULL,
+ nullptr,
max_token_idx,
false,
clip_skip,
@@ -808,18 +888,18 @@ struct SD3CLIPEmbedder : public Conditioner {
work_ctx);
{
auto tensor = chunk_hidden_states_l;
- float original_mean = ggml_tensor_mean(tensor);
+ float original_mean = ggml_ext_tensor_mean(tensor);
for (int i2 = 0; i2 < tensor->ne[2]; i2++) {
for (int i1 = 0; i1 < tensor->ne[1]; i1++) {
for (int i0 = 0; i0 < tensor->ne[0]; i0++) {
- float value = ggml_tensor_get_f32(tensor, i0, i1, i2);
+ float value = ggml_ext_tensor_get_f32(tensor, i0, i1, i2);
value *= chunk_weights[i1];
- ggml_tensor_set_f32(tensor, value, i0, i1, i2);
+ ggml_ext_tensor_set_f32(tensor, value, i0, i1, i2);
}
}
}
- float new_mean = ggml_tensor_mean(tensor);
- ggml_tensor_scale(tensor, (original_mean / new_mean));
+ float new_mean = ggml_ext_tensor_mean(tensor);
+ ggml_ext_tensor_scale_inplace(tensor, (original_mean / new_mean));
}
if (chunk_idx == 0) {
@@ -828,17 +908,24 @@ struct SD3CLIPEmbedder : public Conditioner {
clip_l->compute(n_threads,
input_ids,
0,
- NULL,
+ nullptr,
max_token_idx,
true,
clip_skip,
&pooled_l,
work_ctx);
}
+ } else {
+ chunk_hidden_states_l = ggml_new_tensor_2d(work_ctx, GGML_TYPE_F32, 768, chunk_len);
+ ggml_set_f32(chunk_hidden_states_l, 0.f);
+ if (chunk_idx == 0) {
+ pooled_l = ggml_new_tensor_1d(work_ctx, GGML_TYPE_F32, 768);
+ ggml_set_f32(pooled_l, 0.f);
+ }
}
// clip_g
- {
+ if (clip_g) {
std::vector chunk_tokens(clip_g_tokens.begin() + chunk_idx * chunk_len,
clip_g_tokens.begin() + (chunk_idx + 1) * chunk_len);
std::vector chunk_weights(clip_g_weights.begin() + chunk_idx * chunk_len,
@@ -850,7 +937,7 @@ struct SD3CLIPEmbedder : public Conditioner {
clip_g->compute(n_threads,
input_ids,
0,
- NULL,
+ nullptr,
max_token_idx,
false,
clip_skip,
@@ -859,18 +946,18 @@ struct SD3CLIPEmbedder : public Conditioner {
{
auto tensor = chunk_hidden_states_g;
- float original_mean = ggml_tensor_mean(tensor);
+ float original_mean = ggml_ext_tensor_mean(tensor);
for (int i2 = 0; i2 < tensor->ne[2]; i2++) {
for (int i1 = 0; i1 < tensor->ne[1]; i1++) {
for (int i0 = 0; i0 < tensor->ne[0]; i0++) {
- float value = ggml_tensor_get_f32(tensor, i0, i1, i2);
+ float value = ggml_ext_tensor_get_f32(tensor, i0, i1, i2);
value *= chunk_weights[i1];
- ggml_tensor_set_f32(tensor, value, i0, i1, i2);
+ ggml_ext_tensor_set_f32(tensor, value, i0, i1, i2);
}
}
}
- float new_mean = ggml_tensor_mean(tensor);
- ggml_tensor_scale(tensor, (original_mean / new_mean));
+ float new_mean = ggml_ext_tensor_mean(tensor);
+ ggml_ext_tensor_scale_inplace(tensor, (original_mean / new_mean));
}
if (chunk_idx == 0) {
@@ -879,17 +966,24 @@ struct SD3CLIPEmbedder : public Conditioner {
clip_g->compute(n_threads,
input_ids,
0,
- NULL,
+ nullptr,
max_token_idx,
true,
clip_skip,
&pooled_g,
work_ctx);
}
+ } else {
+ chunk_hidden_states_g = ggml_new_tensor_2d(work_ctx, GGML_TYPE_F32, 1280, chunk_len);
+ ggml_set_f32(chunk_hidden_states_g, 0.f);
+ if (chunk_idx == 0) {
+ pooled_g = ggml_new_tensor_1d(work_ctx, GGML_TYPE_F32, 1280);
+ ggml_set_f32(pooled_g, 0.f);
+ }
}
// t5
- {
+ if (t5) {
std::vector chunk_tokens(t5_tokens.begin() + chunk_idx * chunk_len,
t5_tokens.begin() + (chunk_idx + 1) * chunk_len);
std::vector chunk_weights(t5_weights.begin() + chunk_idx * chunk_len,
@@ -899,24 +993,27 @@ struct SD3CLIPEmbedder : public Conditioner {
t5->compute(n_threads,
input_ids,
- NULL,
+ nullptr,
&chunk_hidden_states_t5,
work_ctx);
{
auto tensor = chunk_hidden_states_t5;
- float original_mean = ggml_tensor_mean(tensor);
+ float original_mean = ggml_ext_tensor_mean(tensor);
for (int i2 = 0; i2 < tensor->ne[2]; i2++) {
for (int i1 = 0; i1 < tensor->ne[1]; i1++) {
for (int i0 = 0; i0 < tensor->ne[0]; i0++) {
- float value = ggml_tensor_get_f32(tensor, i0, i1, i2);
+ float value = ggml_ext_tensor_get_f32(tensor, i0, i1, i2);
value *= chunk_weights[i1];
- ggml_tensor_set_f32(tensor, value, i0, i1, i2);
+ ggml_ext_tensor_set_f32(tensor, value, i0, i1, i2);
}
}
}
- float new_mean = ggml_tensor_mean(tensor);
- ggml_tensor_scale(tensor, (original_mean / new_mean));
+ float new_mean = ggml_ext_tensor_mean(tensor);
+ ggml_ext_tensor_scale_inplace(tensor, (original_mean / new_mean));
}
+ } else {
+ chunk_hidden_states_t5 = ggml_new_tensor_2d(work_ctx, GGML_TYPE_F32, 4096, chunk_len);
+ ggml_set_f32(chunk_hidden_states_t5, 0.f);
}
auto chunk_hidden_states_lg_pad = ggml_new_tensor_3d(work_ctx,
@@ -930,19 +1027,19 @@ struct SD3CLIPEmbedder : public Conditioner {
for (int i0 = 0; i0 < chunk_hidden_states_lg_pad->ne[0]; i0++) {
float value = 0.f;
if (i0 < chunk_hidden_states_l->ne[0]) {
- value = ggml_tensor_get_f32(chunk_hidden_states_l, i0, i1, i2);
+ value = ggml_ext_tensor_get_f32(chunk_hidden_states_l, i0, i1, i2);
} else if (i0 < chunk_hidden_states_l->ne[0] + chunk_hidden_states_g->ne[0]) {
- value = ggml_tensor_get_f32(chunk_hidden_states_g, i0 - chunk_hidden_states_l->ne[0], i1, i2);
+ value = ggml_ext_tensor_get_f32(chunk_hidden_states_g, i0 - chunk_hidden_states_l->ne[0], i1, i2);
}
- ggml_tensor_set_f32(chunk_hidden_states_lg_pad, value, i0, i1, i2);
+ ggml_ext_tensor_set_f32(chunk_hidden_states_lg_pad, value, i0, i1, i2);
}
}
}
- chunk_hidden_states = ggml_tensor_concat(work_ctx, chunk_hidden_states_lg_pad, chunk_hidden_states_t5, 1); // [n_token*2, 4096]
+ chunk_hidden_states = ggml_ext_tensor_concat(work_ctx, chunk_hidden_states_lg_pad, chunk_hidden_states_t5, 1); // [n_token*2, 4096]
if (chunk_idx == 0) {
- pooled = ggml_tensor_concat(work_ctx, pooled_l, pooled_g, 0); // [768 + 1280]
+ pooled = ggml_ext_tensor_concat(work_ctx, pooled_l, pooled_g, 0); // [768 + 1280]
}
int64_t t1 = ggml_time_ms();
@@ -959,41 +1056,32 @@ struct SD3CLIPEmbedder : public Conditioner {
((float*)chunk_hidden_states->data) + ggml_nelements(chunk_hidden_states));
}
- hidden_states = vector_to_ggml_tensor(work_ctx, hidden_states_vec);
- hidden_states = ggml_reshape_2d(work_ctx,
- hidden_states,
- chunk_hidden_states->ne[0],
- ggml_nelements(hidden_states) / chunk_hidden_states->ne[0]);
- return SDCondition(hidden_states, pooled, NULL);
+ if (hidden_states_vec.size() > 0) {
+ hidden_states = vector_to_ggml_tensor(work_ctx, hidden_states_vec);
+ hidden_states = ggml_reshape_2d(work_ctx,
+ hidden_states,
+ chunk_hidden_states->ne[0],
+ ggml_nelements(hidden_states) / chunk_hidden_states->ne[0]);
+ } else {
+ hidden_states = ggml_new_tensor_2d(work_ctx, GGML_TYPE_F32, 4096, 256);
+ ggml_set_f32(hidden_states, 0.f);
+ }
+ if (pooled == nullptr) {
+ pooled = ggml_new_tensor_1d(work_ctx, GGML_TYPE_F32, 2048);
+ ggml_set_f32(pooled, 0.f);
+ }
+ return {hidden_states, pooled, nullptr};
}
SDCondition get_learned_condition(ggml_context* work_ctx,
int n_threads,
- const std::string& text,
- int clip_skip,
- int width,
- int height,
- int adm_in_channels = -1,
- bool zero_out_masked = false) {
- auto tokens_and_weights = tokenize(text, 77, true);
- return get_learned_condition_common(work_ctx, n_threads, tokens_and_weights, clip_skip, zero_out_masked);
- }
-
- std::tuple> get_learned_condition_with_trigger(ggml_context* work_ctx,
- int n_threads,
- const std::string& text,
- int clip_skip,
- int width,
- int height,
- int num_input_imgs,
- int adm_in_channels = -1,
- bool zero_out_masked = false) {
- GGML_ASSERT(0 && "Not implemented yet!");
- }
-
- std::string remove_trigger_from_prompt(ggml_context* work_ctx,
- const std::string& prompt) {
- GGML_ASSERT(0 && "Not implemented yet!");
+ const ConditionerParams& conditioner_params) override {
+ auto tokens_and_weights = tokenize(conditioner_params.text, 77, true);
+ return get_learned_condition_common(work_ctx,
+ n_threads,
+ tokens_and_weights,
+ conditioner_params.clip_skip,
+ conditioner_params.zero_out_masked);
}
};
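Note: with optional encoders, the chunk count follows the longest token stream instead of assuming clip_l is present. A worked example for the 77-token chunks above: if clip_l is padded to 154 tokens and clip_g/t5xxl are absent (0 tokens each), chunk_count = max(154, 0, 0) / 77 = 2, and each chunk fills the missing encoders' slots with zeroed hidden states (1280-wide for clip_g, 4096-wide for t5xxl).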
@@ -1006,29 +1094,69 @@ struct FluxCLIPEmbedder : public Conditioner {
FluxCLIPEmbedder(ggml_backend_t backend,
bool offload_params_to_cpu,
- const String2GGMLType& tensor_types = {}) {
- clip_l = std::make_shared(backend, offload_params_to_cpu, tensor_types, "text_encoders.clip_l.transformer.text_model", OPENAI_CLIP_VIT_L_14, true);
- t5 = std::make_shared(backend, offload_params_to_cpu, tensor_types, "text_encoders.t5xxl.transformer");
+ const String2TensorStorage& tensor_storage_map = {}) {
+ bool use_clip_l = false;
+ bool use_t5 = false;
+ for (auto pair : tensor_storage_map) {
+ if (pair.first.find("text_encoders.clip_l") != std::string::npos) {
+ use_clip_l = true;
+ } else if (pair.first.find("text_encoders.t5xxl") != std::string::npos) {
+ use_t5 = true;
+ }
+ }
+
+ if (!use_clip_l && !use_t5) {
+ LOG_WARN("IMPORTANT NOTICE: No text encoders provided, cannot process prompts!");
+ return;
+ }
+
+ if (use_clip_l) {
+ clip_l = std::make_shared(backend, offload_params_to_cpu, tensor_storage_map, "text_encoders.clip_l.transformer.text_model", OPENAI_CLIP_VIT_L_14, true);
+ } else {
+ LOG_WARN("clip_l text encoder not found! Prompt adherence might be degraded.");
+ }
+ if (use_t5) {
+ t5 = std::make_shared(backend, offload_params_to_cpu, tensor_storage_map, "text_encoders.t5xxl.transformer");
+ } else {
+ LOG_WARN("t5xxl text encoder not found! Prompt adherence might be degraded.");
+ }
}
- void get_param_tensors(std::map& tensors) {
- clip_l->get_param_tensors(tensors, "text_encoders.clip_l.transformer.text_model");
- t5->get_param_tensors(tensors, "text_encoders.t5xxl.transformer");
+ void get_param_tensors(std::map& tensors) override {
+ if (clip_l) {
+ clip_l->get_param_tensors(tensors, "text_encoders.clip_l.transformer.text_model");
+ }
+ if (t5) {
+ t5->get_param_tensors(tensors, "text_encoders.t5xxl.transformer");
+ }
}
- void alloc_params_buffer() {
- clip_l->alloc_params_buffer();
- t5->alloc_params_buffer();
+ void alloc_params_buffer() override {
+ if (clip_l) {
+ clip_l->alloc_params_buffer();
+ }
+ if (t5) {
+ t5->alloc_params_buffer();
+ }
}
- void free_params_buffer() {
- clip_l->free_params_buffer();
- t5->free_params_buffer();
+ void free_params_buffer() override {
+ if (clip_l) {
+ clip_l->free_params_buffer();
+ }
+ if (t5) {
+ t5->free_params_buffer();
+ }
}
- size_t get_params_buffer_size() {
- size_t buffer_size = clip_l->get_params_buffer_size();
- buffer_size += t5->get_params_buffer_size();
+ size_t get_params_buffer_size() override {
+ size_t buffer_size = 0;
+ if (clip_l) {
+ buffer_size += clip_l->get_params_buffer_size();
+ }
+ if (t5) {
+ buffer_size += t5->get_params_buffer_size();
+ }
return buffer_size;
}
@@ -1058,18 +1186,24 @@ struct FluxCLIPEmbedder : public Conditioner {
for (const auto& item : parsed_attention) {
const std::string& curr_text = item.first;
float curr_weight = item.second;
-
- std::vector curr_tokens = clip_l_tokenizer.encode(curr_text, on_new_token_cb);
- clip_l_tokens.insert(clip_l_tokens.end(), curr_tokens.begin(), curr_tokens.end());
- clip_l_weights.insert(clip_l_weights.end(), curr_tokens.size(), curr_weight);
-
- curr_tokens = t5_tokenizer.Encode(curr_text, true);
- t5_tokens.insert(t5_tokens.end(), curr_tokens.begin(), curr_tokens.end());
- t5_weights.insert(t5_weights.end(), curr_tokens.size(), curr_weight);
+ if (clip_l) {
+ std::vector curr_tokens = clip_l_tokenizer.encode(curr_text, on_new_token_cb);
+ clip_l_tokens.insert(clip_l_tokens.end(), curr_tokens.begin(), curr_tokens.end());
+ clip_l_weights.insert(clip_l_weights.end(), curr_tokens.size(), curr_weight);
+ }
+ if (t5) {
+ std::vector curr_tokens = t5_tokenizer.Encode(curr_text, true);
+ t5_tokens.insert(t5_tokens.end(), curr_tokens.begin(), curr_tokens.end());
+ t5_weights.insert(t5_weights.end(), curr_tokens.size(), curr_weight);
+ }
}
- clip_l_tokenizer.pad_tokens(clip_l_tokens, clip_l_weights, 77, padding);
- t5_tokenizer.pad_tokens(t5_tokens, t5_weights, NULL, max_length, padding);
+ if (clip_l) {
+ clip_l_tokenizer.pad_tokens(clip_l_tokens, clip_l_weights, 77, padding);
+ }
+ if (t5) {
+ t5_tokenizer.pad_tokens(t5_tokens, t5_weights, nullptr, max_length, padding);
+ }
// for (int i = 0; i < clip_l_tokens.size(); i++) {
// std::cout << clip_l_tokens[i] << ":" << clip_l_weights[i] << ", ";
@@ -1099,40 +1233,42 @@ struct FluxCLIPEmbedder : public Conditioner {
}
int64_t t0 = ggml_time_ms();
- struct ggml_tensor* hidden_states = NULL; // [N, n_token, 4096]
- struct ggml_tensor* chunk_hidden_states = NULL; // [n_token, 4096]
- struct ggml_tensor* pooled = NULL; // [768,]
+ struct ggml_tensor* hidden_states = nullptr; // [N, n_token, 4096]
+ struct ggml_tensor* chunk_hidden_states = nullptr; // [n_token, 4096]
+ struct ggml_tensor* pooled = nullptr; // [768,]
std::vector hidden_states_vec;
- size_t chunk_count = t5_tokens.size() / chunk_len;
+ size_t chunk_count = std::max(clip_l_tokens.size() > 0 ? chunk_len : 0, t5_tokens.size()) / chunk_len;
for (int chunk_idx = 0; chunk_idx < chunk_count; chunk_idx++) {
// clip_l
if (chunk_idx == 0) {
- size_t chunk_len_l = 77;
- std::vector chunk_tokens(clip_l_tokens.begin(),
- clip_l_tokens.begin() + chunk_len_l);
- std::vector chunk_weights(clip_l_weights.begin(),
- clip_l_weights.begin() + chunk_len_l);
+ if (clip_l) {
+ size_t chunk_len_l = 77;
+ std::vector chunk_tokens(clip_l_tokens.begin(),
+ clip_l_tokens.begin() + chunk_len_l);
+ std::vector chunk_weights(clip_l_weights.begin(),
+ clip_l_weights.begin() + chunk_len_l);
- auto input_ids = vector_to_ggml_tensor_i32(work_ctx, chunk_tokens);
- size_t max_token_idx = 0;
+ auto input_ids = vector_to_ggml_tensor_i32(work_ctx, chunk_tokens);
+ size_t max_token_idx = 0;
- auto it = std::find(chunk_tokens.begin(), chunk_tokens.end(), clip_l_tokenizer.EOS_TOKEN_ID);
- max_token_idx = std::min(std::distance(chunk_tokens.begin(), it), chunk_tokens.size() - 1);
+ auto it = std::find(chunk_tokens.begin(), chunk_tokens.end(), clip_l_tokenizer.EOS_TOKEN_ID);
+ max_token_idx = std::min(std::distance(chunk_tokens.begin(), it), chunk_tokens.size() - 1);
- clip_l->compute(n_threads,
- input_ids,
- 0,
- NULL,
- max_token_idx,
- true,
- clip_skip,
- &pooled,
- work_ctx);
+ clip_l->compute(n_threads,
+ input_ids,
+ 0,
+ nullptr,
+ max_token_idx,
+ true,
+ clip_skip,
+ &pooled,
+ work_ctx);
+ }
}
// t5
- {
+ if (t5) {
std::vector chunk_tokens(t5_tokens.begin() + chunk_idx * chunk_len,
t5_tokens.begin() + (chunk_idx + 1) * chunk_len);
std::vector chunk_weights(t5_weights.begin() + chunk_idx * chunk_len,
@@ -1142,24 +1278,27 @@ struct FluxCLIPEmbedder : public Conditioner {
t5->compute(n_threads,
input_ids,
- NULL,
+ nullptr,
&chunk_hidden_states,
work_ctx);
{
auto tensor = chunk_hidden_states;
- float original_mean = ggml_tensor_mean(tensor);
+ float original_mean = ggml_ext_tensor_mean(tensor);
for (int i2 = 0; i2 < tensor->ne[2]; i2++) {
for (int i1 = 0; i1 < tensor->ne[1]; i1++) {
for (int i0 = 0; i0 < tensor->ne[0]; i0++) {
- float value = ggml_tensor_get_f32(tensor, i0, i1, i2);
+ float value = ggml_ext_tensor_get_f32(tensor, i0, i1, i2);
value *= chunk_weights[i1];
- ggml_tensor_set_f32(tensor, value, i0, i1, i2);
+ ggml_ext_tensor_set_f32(tensor, value, i0, i1, i2);
}
}
}
- float new_mean = ggml_tensor_mean(tensor);
- ggml_tensor_scale(tensor, (original_mean / new_mean));
+ float new_mean = ggml_ext_tensor_mean(tensor);
+ ggml_ext_tensor_scale_inplace(tensor, (original_mean / new_mean));
}
+ } else {
+ chunk_hidden_states = ggml_new_tensor_2d(work_ctx, GGML_TYPE_F32, 4096, chunk_len);
+ ggml_set_f32(chunk_hidden_states, 0.f);
}
int64_t t1 = ggml_time_ms();
@@ -1176,41 +1315,32 @@ struct FluxCLIPEmbedder : public Conditioner {
((float*)chunk_hidden_states->data) + ggml_nelements(chunk_hidden_states));
}
- hidden_states = vector_to_ggml_tensor(work_ctx, hidden_states_vec);
- hidden_states = ggml_reshape_2d(work_ctx,
- hidden_states,
- chunk_hidden_states->ne[0],
- ggml_nelements(hidden_states) / chunk_hidden_states->ne[0]);
- return SDCondition(hidden_states, pooled, NULL);
+ if (hidden_states_vec.size() > 0) {
+ hidden_states = vector_to_ggml_tensor(work_ctx, hidden_states_vec);
+ hidden_states = ggml_reshape_2d(work_ctx,
+ hidden_states,
+ chunk_hidden_states->ne[0],
+ ggml_nelements(hidden_states) / chunk_hidden_states->ne[0]);
+ } else {
+ hidden_states = ggml_new_tensor_2d(work_ctx, GGML_TYPE_F32, 4096, 256);
+ ggml_set_f32(hidden_states, 0.f);
+ }
+ if (pooled == nullptr) {
+ pooled = ggml_new_tensor_1d(work_ctx, GGML_TYPE_F32, 768);
+ ggml_set_f32(pooled, 0.f);
+ }
+ return {hidden_states, pooled, nullptr};
}
SDCondition get_learned_condition(ggml_context* work_ctx,
int n_threads,
- const std::string& text,
- int clip_skip,
- int width,
- int height,
- int adm_in_channels = -1,
- bool zero_out_masked = false) {
- auto tokens_and_weights = tokenize(text, chunk_len, true);
- return get_learned_condition_common(work_ctx, n_threads, tokens_and_weights, clip_skip, zero_out_masked);
- }
-
- std::tuple> get_learned_condition_with_trigger(ggml_context* work_ctx,
- int n_threads,
- const std::string& text,
- int clip_skip,
- int width,
- int height,
- int num_input_imgs,
- int adm_in_channels = -1,
- bool zero_out_masked = false) {
- GGML_ASSERT(0 && "Not implemented yet!");
- }
-
- std::string remove_trigger_from_prompt(ggml_context* work_ctx,
- const std::string& prompt) {
- GGML_ASSERT(0 && "Not implemented yet!");
+ const ConditionerParams& conditioner_params) override {
+ auto tokens_and_weights = tokenize(conditioner_params.text, chunk_len, true);
+ return get_learned_condition_common(work_ctx,
+ n_threads,
+ tokens_and_weights,
+ conditioner_params.clip_skip,
+ conditioner_params.zero_out_masked);
}
};
@@ -1224,31 +1354,49 @@ struct T5CLIPEmbedder : public Conditioner {
T5CLIPEmbedder(ggml_backend_t backend,
bool offload_params_to_cpu,
- const String2GGMLType& tensor_types = {},
- bool use_mask = false,
- int mask_pad = 1,
- bool is_umt5 = false)
+ const String2TensorStorage& tensor_storage_map = {},
+ bool use_mask = false,
+ int mask_pad = 1,
+ bool is_umt5 = false)
: use_mask(use_mask), mask_pad(mask_pad), t5_tokenizer(is_umt5) {
- t5 = std::make_shared(backend, offload_params_to_cpu, tensor_types, "text_encoders.t5xxl.transformer", is_umt5);
+ bool use_t5 = false;
+ for (auto pair : tensor_storage_map) {
+ if (pair.first.find("text_encoders.t5xxl") != std::string::npos) {
+ use_t5 = true;
+ }
+ }
+
+ if (!use_t5) {
+ LOG_WARN("IMPORTANT NOTICE: No text encoders provided, cannot process prompts!");
+ return;
+ } else {
+ t5 = std::make_shared(backend, offload_params_to_cpu, tensor_storage_map, "text_encoders.t5xxl.transformer", is_umt5);
+ }
}
- void get_param_tensors(std::map& tensors) {
- t5->get_param_tensors(tensors, "text_encoders.t5xxl.transformer");
+ void get_param_tensors(std::map& tensors) override {
+ if (t5) {
+ t5->get_param_tensors(tensors, "text_encoders.t5xxl.transformer");
+ }
}
- void alloc_params_buffer() {
- t5->alloc_params_buffer();
+ void alloc_params_buffer() override {
+ if (t5) {
+ t5->alloc_params_buffer();
+ }
}
- void free_params_buffer() {
- t5->free_params_buffer();
+ void free_params_buffer() override {
+ if (t5) {
+ t5->free_params_buffer();
+ }
}
- size_t get_params_buffer_size() {
+ size_t get_params_buffer_size() override {
size_t buffer_size = 0;
-
- buffer_size += t5->get_params_buffer_size();
-
+ if (t5) {
+ buffer_size += t5->get_params_buffer_size();
+ }
return buffer_size;
}
@@ -1274,17 +1422,18 @@ struct T5CLIPEmbedder : public Conditioner {
std::vector t5_tokens;
std::vector t5_weights;
std::vector t5_mask;
- for (const auto& item : parsed_attention) {
- const std::string& curr_text = item.first;
- float curr_weight = item.second;
-
- std::vector curr_tokens = t5_tokenizer.Encode(curr_text, true);
- t5_tokens.insert(t5_tokens.end(), curr_tokens.begin(), curr_tokens.end());
- t5_weights.insert(t5_weights.end(), curr_tokens.size(), curr_weight);
- }
+ if (t5) {
+ for (const auto& item : parsed_attention) {
+ const std::string& curr_text = item.first;
+ float curr_weight = item.second;
- t5_tokenizer.pad_tokens(t5_tokens, t5_weights, &t5_mask, max_length, padding);
+ std::vector curr_tokens = t5_tokenizer.Encode(curr_text, true);
+ t5_tokens.insert(t5_tokens.end(), curr_tokens.begin(), curr_tokens.end());
+ t5_weights.insert(t5_weights.end(), curr_tokens.size(), curr_weight);
+ }
+ t5_tokenizer.pad_tokens(t5_tokens, t5_weights, &t5_mask, max_length, padding);
+ }
return {t5_tokens, t5_weights, t5_mask};
}
@@ -1308,14 +1457,21 @@ struct T5CLIPEmbedder : public Conditioner {
std::tuple, std::vector, std::vector> token_and_weights,
int clip_skip,
bool zero_out_masked = false) {
+ if (!t5) {
+ auto hidden_states = ggml_new_tensor_2d(work_ctx, GGML_TYPE_F32, 4096, 256);
+ ggml_set_f32(hidden_states, 0.f);
+ auto t5_attn_mask = ggml_new_tensor_1d(work_ctx, GGML_TYPE_F32, 256);
+ ggml_set_f32(t5_attn_mask, -HUGE_VALF);
+ return {hidden_states, t5_attn_mask, nullptr};
+ }
auto& t5_tokens = std::get<0>(token_and_weights);
auto& t5_weights = std::get<1>(token_and_weights);
auto& t5_attn_mask_vec = std::get<2>(token_and_weights);
int64_t t0 = ggml_time_ms();
- struct ggml_tensor* hidden_states = NULL; // [N, n_token, 4096]
- struct ggml_tensor* chunk_hidden_states = NULL; // [n_token, 4096]
- struct ggml_tensor* pooled = NULL;
+ struct ggml_tensor* hidden_states = nullptr; // [N, n_token, 4096]
+ struct ggml_tensor* chunk_hidden_states = nullptr; // [n_token, 4096]
+ struct ggml_tensor* pooled = nullptr;
struct ggml_tensor* t5_attn_mask = vector_to_ggml_tensor(work_ctx, t5_attn_mask_vec); // [n_token]
std::vector hidden_states_vec;
@@ -1332,7 +1488,7 @@ struct T5CLIPEmbedder : public Conditioner {
t5_attn_mask_vec.begin() + (chunk_idx + 1) * chunk_len);
auto input_ids = vector_to_ggml_tensor_i32(work_ctx, chunk_tokens);
- auto t5_attn_mask_chunk = use_mask ? vector_to_ggml_tensor(work_ctx, chunk_mask) : NULL;
+ auto t5_attn_mask_chunk = use_mask ? vector_to_ggml_tensor(work_ctx, chunk_mask) : nullptr;
t5->compute(n_threads,
input_ids,
@@ -1341,18 +1497,18 @@ struct T5CLIPEmbedder : public Conditioner {
work_ctx);
{
auto tensor = chunk_hidden_states;
- float original_mean = ggml_tensor_mean(tensor);
+ float original_mean = ggml_ext_tensor_mean(tensor);
for (int i2 = 0; i2 < tensor->ne[2]; i2++) {
for (int i1 = 0; i1 < tensor->ne[1]; i1++) {
for (int i0 = 0; i0 < tensor->ne[0]; i0++) {
- float value = ggml_tensor_get_f32(tensor, i0, i1, i2);
+ float value = ggml_ext_tensor_get_f32(tensor, i0, i1, i2);
value *= chunk_weights[i1];
- ggml_tensor_set_f32(tensor, value, i0, i1, i2);
+ ggml_ext_tensor_set_f32(tensor, value, i0, i1, i2);
}
}
}
- float new_mean = ggml_tensor_mean(tensor);
- ggml_tensor_scale(tensor, (original_mean / new_mean));
+ float new_mean = ggml_ext_tensor_mean(tensor);
+ ggml_ext_tensor_scale_inplace(tensor, (original_mean / new_mean));
}
int64_t t1 = ggml_time_ms();
@@ -1363,7 +1519,7 @@ struct T5CLIPEmbedder : public Conditioner {
for (int i1 = 0; i1 < tensor->ne[1]; i1++) {
for (int i0 = 0; i0 < tensor->ne[0]; i0++) {
if (chunk_mask[i1] < 0.f) {
- ggml_tensor_set_f32(tensor, 0.f, i0, i1, i2);
+ ggml_ext_tensor_set_f32(tensor, 0.f, i0, i1, i2);
}
}
}
@@ -1384,36 +1540,219 @@ struct T5CLIPEmbedder : public Conditioner {
modify_mask_to_attend_padding(t5_attn_mask, ggml_nelements(t5_attn_mask), mask_pad);
- return SDCondition(hidden_states, t5_attn_mask, NULL);
+ return {hidden_states, t5_attn_mask, nullptr};
}
SDCondition get_learned_condition(ggml_context* work_ctx,
int n_threads,
- const std::string& text,
- int clip_skip,
- int width,
- int height,
- int adm_in_channels = -1,
- bool zero_out_masked = false) {
- auto tokens_and_weights = tokenize(text, chunk_len, true);
- return get_learned_condition_common(work_ctx, n_threads, tokens_and_weights, clip_skip, zero_out_masked);
+ const ConditionerParams& conditioner_params) override {
+ auto tokens_and_weights = tokenize(conditioner_params.text, chunk_len, true);
+ return get_learned_condition_common(work_ctx,
+ n_threads,
+ tokens_and_weights,
+ conditioner_params.clip_skip,
+ conditioner_params.zero_out_masked);
}
+};
- std::tuple> get_learned_condition_with_trigger(ggml_context* work_ctx,
- int n_threads,
- const std::string& text,
- int clip_skip,
- int width,
- int height,
- int num_input_imgs,
- int adm_in_channels = -1,
- bool zero_out_masked = false) {
- GGML_ASSERT(0 && "Not implemented yet!");
+struct Qwen2_5_VLCLIPEmbedder : public Conditioner {
+ Qwen::Qwen2Tokenizer tokenizer;
+ std::shared_ptr qwenvl;
+
+ Qwen2_5_VLCLIPEmbedder(ggml_backend_t backend,
+ bool offload_params_to_cpu,
+ const String2TensorStorage& tensor_storage_map = {},
+ const std::string prefix = "",
+ bool enable_vision = false) {
+ qwenvl = std::make_shared(backend,
+ offload_params_to_cpu,
+ tensor_storage_map,
+ "text_encoders.qwen2vl",
+ enable_vision);
}
- std::string remove_trigger_from_prompt(ggml_context* work_ctx,
- const std::string& prompt) {
- GGML_ASSERT(0 && "Not implemented yet!");
+ void get_param_tensors(std::map& tensors) override {
+ qwenvl->get_param_tensors(tensors, "text_encoders.qwen2vl");
+ }
+
+ void alloc_params_buffer() override {
+ qwenvl->alloc_params_buffer();
+ }
+
+ void free_params_buffer() override {
+ qwenvl->free_params_buffer();
+ }
+
+ size_t get_params_buffer_size() override {
+ size_t buffer_size = 0;
+ buffer_size += qwenvl->get_params_buffer_size();
+ return buffer_size;
+ }
+
+ std::tuple, std::vector> tokenize(std::string text,
+ size_t max_length = 0,
+ size_t system_prompt_length = 0,
+ bool padding = false) {
+ std::vector> parsed_attention;
+ if (system_prompt_length > 0) {
+ parsed_attention.emplace_back(text.substr(0, system_prompt_length), 1.f);
+ auto new_parsed_attention = parse_prompt_attention(text.substr(system_prompt_length, text.size() - system_prompt_length));
+ parsed_attention.insert(parsed_attention.end(),
+ new_parsed_attention.begin(),
+ new_parsed_attention.end());
+ } else {
+ parsed_attention = parse_prompt_attention(text);
+ }
+
+ {
+ std::stringstream ss;
+ ss << "[";
+ for (const auto& item : parsed_attention) {
+ ss << "['" << item.first << "', " << item.second << "], ";
+ }
+ ss << "]";
+ LOG_DEBUG("parse '%s' to %s", text.c_str(), ss.str().c_str());
+ }
+
+ std::vector tokens;
+ std::vector weights;
+ for (const auto& item : parsed_attention) {
+ const std::string& curr_text = item.first;
+ float curr_weight = item.second;
+ std::vector curr_tokens = tokenizer.tokenize(curr_text, nullptr);
+ tokens.insert(tokens.end(), curr_tokens.begin(), curr_tokens.end());
+ weights.insert(weights.end(), curr_tokens.size(), curr_weight);
+ }
+
+ tokenizer.pad_tokens(tokens, weights, max_length, padding);
+
+ // for (int i = 0; i < tokens.size(); i++) {
+ // std::cout << tokens[i] << ":" << weights[i] << ", " << i << std::endl;
+ // }
+ // std::cout << std::endl;
+
+ return {tokens, weights};
+ }
+
+ SDCondition get_learned_condition(ggml_context* work_ctx,
+ int n_threads,
+ const ConditionerParams& conditioner_params) override {
+ std::string prompt;
+ std::vector> image_embeds;
+ size_t system_prompt_length = 0;
+ int prompt_template_encode_start_idx = 34;
+ if (qwenvl->enable_vision && conditioner_params.ref_images.size() > 0) {
+ LOG_INFO("QwenImageEditPlusPipeline");
+ prompt_template_encode_start_idx = 64;
+ int image_embed_idx = 64 + 6;
+
+ int min_pixels = 384 * 384;
+ int max_pixels = 560 * 560;
+ std::string placeholder = "<|image_pad|>";
+ std::string img_prompt;
+
+ for (int i = 0; i < conditioner_params.ref_images.size(); i++) {
+ sd_image_f32_t image = sd_image_t_to_sd_image_f32_t(*conditioner_params.ref_images[i]);
+ double factor = qwenvl->params.vision.patch_size * qwenvl->params.vision.spatial_merge_size;
+ int height = image.height;
+ int width = image.width;
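+ // snap the target size to a multiple of the vision patch grid (patch_size * spatial_merge_size)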
+ int h_bar = static_cast(std::round(height / factor)) * factor;
+ int w_bar = static_cast(std::round(width / factor)) * factor;
+
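+ // keep the total pixel area within [min_pixels, max_pixels] while preserving the aspect ratio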
+ if (static_cast(h_bar) * w_bar > max_pixels) {
+ double beta = std::sqrt((height * width) / static_cast(max_pixels));
+ h_bar = std::max(static_cast(factor),
+ static_cast(std::floor(height / beta / factor)) * static_cast(factor));
+ w_bar = std::max(static_cast(factor),
+ static_cast(std::floor(width / beta / factor)) * static_cast(factor));
+ } else if (static_cast(h_bar) * w_bar < min_pixels) {
+ double beta = std::sqrt(static_cast(min_pixels) / (height * width));
+ h_bar = static_cast(std::ceil(height * beta / factor)) * static_cast(factor);
+ w_bar = static_cast(std::ceil(width * beta / factor)) * static_cast(factor);
+ }
+
+ LOG_DEBUG("resize conditioner ref image %d from %dx%d to %dx%d", i, image.height, image.width, h_bar, w_bar);
+
+ sd_image_f32_t resized_image = clip_preprocess(image, w_bar, h_bar);
+ free(image.data);
+ image.data = nullptr;
+
+ ggml_tensor* image_tensor = ggml_new_tensor_4d(work_ctx, GGML_TYPE_F32, resized_image.width, resized_image.height, 3, 1);
+ sd_image_f32_to_ggml_tensor(resized_image, image_tensor, false);
+ free(resized_image.data);
+ resized_image.data = nullptr;
+
+ ggml_tensor* image_embed = nullptr;
+ qwenvl->encode_image(n_threads, image_tensor, &image_embed, work_ctx);
+ image_embeds.emplace_back(image_embed_idx, image_embed);
+ image_embed_idx += 1 + image_embed->ne[1] + 6;
+
+ img_prompt += "Picture " + std::to_string(i + 1) + ": <|vision_start|>"; // [24669, 220, index, 25, 220, 151652]
+ int64_t num_image_tokens = image_embed->ne[1];
+ img_prompt.reserve(num_image_tokens * placeholder.size());
+ for (int j = 0; j < num_image_tokens; j++) {
+ img_prompt += placeholder;
+ }
+ img_prompt += "<|vision_end|>";
+ }
+
+ prompt = "<|im_start|>system\nDescribe the key features of the input image (color, shape, size, texture, objects, background), then explain how the user's text instruction should alter or modify the image. Generate a new image that meets the user's requirements while maintaining consistency with the original input where appropriate.<|im_end|>\n<|im_start|>user\n";
+
+ system_prompt_length = prompt.size();
+
+ prompt += img_prompt;
+ prompt += conditioner_params.text;
+ prompt += "<|im_end|>\n<|im_start|>assistant\n";
+ } else {
+ prompt = "<|im_start|>system\nDescribe the image by detailing the color, shape, size, texture, quantity, text, spatial relationships of the objects and background:<|im_end|>\n<|im_start|>user\n" + conditioner_params.text + "<|im_end|>\n<|im_start|>assistant\n";
+ }
+
+ auto tokens_and_weights = tokenize(prompt, 0, system_prompt_length, false);
+ auto& tokens = std::get<0>(tokens_and_weights);
+ auto& weights = std::get<1>(tokens_and_weights);
+
+ int64_t t0 = ggml_time_ms();
+ struct ggml_tensor* hidden_states = nullptr; // [N, n_token, 3584]
+
+ auto input_ids = vector_to_ggml_tensor_i32(work_ctx, tokens);
+
+ qwenvl->compute(n_threads,
+ input_ids,
+ image_embeds,
+ &hidden_states,
+ work_ctx);
+ {
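+ // apply the per-token prompt weights, then rescale so the tensor keeps its original mean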
+ auto tensor = hidden_states;
+ float original_mean = ggml_ext_tensor_mean(tensor);
+ for (int i2 = 0; i2 < tensor->ne[2]; i2++) {
+ for (int i1 = 0; i1 < tensor->ne[1]; i1++) {
+ for (int i0 = 0; i0 < tensor->ne[0]; i0++) {
+ float value = ggml_ext_tensor_get_f32(tensor, i0, i1, i2);
+ value *= weights[i1];
+ ggml_ext_tensor_set_f32(tensor, value, i0, i1, i2);
+ }
+ }
+ }
+ float new_mean = ggml_ext_tensor_mean(tensor);
+ ggml_ext_tensor_scale_inplace(tensor, (original_mean / new_mean));
+ }
+
+ GGML_ASSERT(hidden_states->ne[1] > prompt_template_encode_start_idx);
+
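+ // drop the chat-template prefix: only embeddings after prompt_template_encode_start_idx are kept as conditioning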
+ ggml_tensor* new_hidden_states = ggml_new_tensor_3d(work_ctx,
+ GGML_TYPE_F32,
+ hidden_states->ne[0],
+ hidden_states->ne[1] - prompt_template_encode_start_idx,
+ hidden_states->ne[2]);
+
+ ggml_ext_tensor_iter(new_hidden_states, [&](ggml_tensor* new_hidden_states, int64_t i0, int64_t i1, int64_t i2, int64_t i3) {
+ float value = ggml_ext_tensor_get_f32(hidden_states, i0, i1 + prompt_template_encode_start_idx, i2, i3);
+ ggml_ext_tensor_set_f32(new_hidden_states, value, i0, i1, i2, i3);
+ });
+
+ int64_t t1 = ggml_time_ms();
+ LOG_DEBUG("computing condition graph completed, taking %" PRId64 " ms", t1 - t0);
+ return {new_hidden_states, nullptr, nullptr};
}
};
diff --git a/control.hpp b/control.hpp
index 79b82a220..b34140efb 100644
--- a/control.hpp
+++ b/control.hpp
@@ -27,6 +27,7 @@ class ControlNetBlock : public GGMLBlock {
int num_heads = 8;
int num_head_channels = -1; // channels // num_heads
int context_dim = 768; // 1024 for VERSION_SD2, 2048 for VERSION_SDXL
+ bool use_linear_projection = false;
public:
int model_channels = 320;
@@ -82,7 +83,7 @@ class ControlNetBlock : public GGMLBlock {
int64_t d_head,
int64_t depth,
int64_t context_dim) -> SpatialTransformer* {
- return new SpatialTransformer(in_channels, n_head, d_head, depth, context_dim);
+ return new SpatialTransformer(in_channels, n_head, d_head, depth, context_dim, use_linear_projection);
};
auto make_zero_conv = [&](int64_t channels) {
@@ -165,7 +166,7 @@ class ControlNetBlock : public GGMLBlock {
}
struct ggml_tensor* resblock_forward(std::string name,
- struct ggml_context* ctx,
+ GGMLRunnerContext* ctx,
struct ggml_tensor* x,
struct ggml_tensor* emb) {
auto block = std::dynamic_pointer_cast(blocks[name]);
@@ -173,15 +174,14 @@ class ControlNetBlock : public GGMLBlock {
}
struct ggml_tensor* attention_layer_forward(std::string name,
- struct ggml_context* ctx,
- ggml_backend_t backend,
+ GGMLRunnerContext* ctx,
struct ggml_tensor* x,
struct ggml_tensor* context) {
auto block = std::dynamic_pointer_cast(blocks[name]);
- return block->forward(ctx, backend, x, context);
+ return block->forward(ctx, x, context);
}
- struct ggml_tensor* input_hint_block_forward(struct ggml_context* ctx,
+ struct ggml_tensor* input_hint_block_forward(GGMLRunnerContext* ctx,
struct ggml_tensor* hint,
struct ggml_tensor* emb,
struct ggml_tensor* context) {
@@ -193,33 +193,32 @@ class ControlNetBlock : public GGMLBlock {
h = block->forward(ctx, h);
} else {
- h = ggml_silu_inplace(ctx, h);
+ h = ggml_silu_inplace(ctx->ggml_ctx, h);
}
}
return h;
}
- std::vector forward(struct ggml_context* ctx,
- ggml_backend_t backend,
+ std::vector forward(GGMLRunnerContext* ctx,
struct ggml_tensor* x,
struct ggml_tensor* hint,
struct ggml_tensor* guided_hint,
struct ggml_tensor* timesteps,
struct ggml_tensor* context,
- struct ggml_tensor* y = NULL) {
+ struct ggml_tensor* y = nullptr) {
// x: [N, in_channels, h, w] or [N, in_channels/2, h, w]
// timesteps: [N,]
// context: [N, max_position, hidden_size] or [1, max_position, hidden_size]. for example, [N, 77, 768]
// y: [N, adm_in_channels] or [1, adm_in_channels]
- if (context != NULL) {
+ if (context != nullptr) {
if (context->ne[2] != x->ne[3]) {
- context = ggml_repeat(ctx, context, ggml_new_tensor_3d(ctx, GGML_TYPE_F32, context->ne[0], context->ne[1], x->ne[3]));
+ context = ggml_repeat(ctx->ggml_ctx, context, ggml_new_tensor_3d(ctx->ggml_ctx, GGML_TYPE_F32, context->ne[0], context->ne[1], x->ne[3]));
}
}
- if (y != NULL) {
+ if (y != nullptr) {
if (y->ne[1] != x->ne[3]) {
- y = ggml_repeat(ctx, y, ggml_new_tensor_2d(ctx, GGML_TYPE_F32, y->ne[0], x->ne[3]));
+ y = ggml_repeat(ctx->ggml_ctx, y, ggml_new_tensor_2d(ctx->ggml_ctx, GGML_TYPE_F32, y->ne[0], x->ne[3]));
}
}
@@ -230,27 +229,27 @@ class ControlNetBlock : public GGMLBlock {
auto middle_block_out = std::dynamic_pointer_cast(blocks["middle_block_out.0"]);
- auto t_emb = ggml_nn_timestep_embedding(ctx, timesteps, model_channels); // [N, model_channels]
+ auto t_emb = ggml_ext_timestep_embedding(ctx->ggml_ctx, timesteps, model_channels); // [N, model_channels]
auto emb = time_embed_0->forward(ctx, t_emb);
- emb = ggml_silu_inplace(ctx, emb);
+ emb = ggml_silu_inplace(ctx->ggml_ctx, emb);
emb = time_embed_2->forward(ctx, emb); // [N, time_embed_dim]
// SDXL/SVD
- if (y != NULL) {
+ if (y != nullptr) {
auto label_embed_0 = std::dynamic_pointer_cast(blocks["label_emb.0.0"]);
auto label_embed_2 = std::dynamic_pointer_cast(blocks["label_emb.0.2"]);
auto label_emb = label_embed_0->forward(ctx, y);
- label_emb = ggml_silu_inplace(ctx, label_emb);
+ label_emb = ggml_silu_inplace(ctx->ggml_ctx, label_emb);
label_emb = label_embed_2->forward(ctx, label_emb); // [N, time_embed_dim]
- emb = ggml_add(ctx, emb, label_emb); // [N, time_embed_dim]
+ emb = ggml_add(ctx->ggml_ctx, emb, label_emb); // [N, time_embed_dim]
}
std::vector outs;
- if (guided_hint == NULL) {
+ if (guided_hint == nullptr) {
guided_hint = input_hint_block_forward(ctx, hint, emb, context);
}
outs.push_back(guided_hint);
@@ -259,7 +258,7 @@ class ControlNetBlock : public GGMLBlock {
// input block 0
auto h = input_blocks_0_0->forward(ctx, x);
- h = ggml_add(ctx, h, guided_hint);
+ h = ggml_add(ctx->ggml_ctx, h, guided_hint);
outs.push_back(zero_convs_0->forward(ctx, h));
// input block 1-11
@@ -274,7 +273,7 @@ class ControlNetBlock : public GGMLBlock {
h = resblock_forward(name, ctx, h, emb); // [N, mult*model_channels, h, w]
if (std::find(attention_resolutions.begin(), attention_resolutions.end(), ds) != attention_resolutions.end()) {
std::string name = "input_blocks." + std::to_string(input_block_idx) + ".1";
- h = attention_layer_forward(name, ctx, backend, h, context); // [N, mult*model_channels, h, w]
+ h = attention_layer_forward(name, ctx, h, context); // [N, mult*model_channels, h, w]
}
auto zero_conv = std::dynamic_pointer_cast(blocks["zero_convs." + std::to_string(input_block_idx) + ".0"]);
@@ -298,9 +297,9 @@ class ControlNetBlock : public GGMLBlock {
// [N, 4*model_channels, h/8, w/8]
// middle_block
- h = resblock_forward("middle_block.0", ctx, h, emb); // [N, 4*model_channels, h/8, w/8]
- h = attention_layer_forward("middle_block.1", ctx, backend, h, context); // [N, 4*model_channels, h/8, w/8]
- h = resblock_forward("middle_block.2", ctx, h, emb); // [N, 4*model_channels, h/8, w/8]
+ h = resblock_forward("middle_block.0", ctx, h, emb); // [N, 4*model_channels, h/8, w/8]
+ h = attention_layer_forward("middle_block.1", ctx, h, context); // [N, 4*model_channels, h/8, w/8]
+ h = resblock_forward("middle_block.2", ctx, h, emb); // [N, 4*model_channels, h/8, w/8]
// out
outs.push_back(middle_block_out->forward(ctx, h));
@@ -312,39 +311,28 @@ struct ControlNet : public GGMLRunner {
SDVersion version = VERSION_SD1;
ControlNetBlock control_net;
- ggml_backend_buffer_t control_buffer = NULL; // keep control output tensors in backend memory
- ggml_context* control_ctx = NULL;
+ ggml_backend_buffer_t control_buffer = nullptr; // keep control output tensors in backend memory
+ ggml_context* control_ctx = nullptr;
std::vector controls; // (12 input block outputs, 1 middle block output) SD 1.5
- struct ggml_tensor* guided_hint = NULL; // guided_hint cache, for faster inference
+ struct ggml_tensor* guided_hint = nullptr; // guided_hint cache, for faster inference
bool guided_hint_cached = false;
ControlNet(ggml_backend_t backend,
bool offload_params_to_cpu,
- const String2GGMLType& tensor_types = {},
- SDVersion version = VERSION_SD1)
+ const String2TensorStorage& tensor_storage_map = {},
+ SDVersion version = VERSION_SD1)
: GGMLRunner(backend, offload_params_to_cpu), control_net(version) {
- control_net.init(params_ctx, tensor_types, "");
+ control_net.init(params_ctx, tensor_storage_map, "");
}
- void enable_conv2d_direct() {
- std::vector blocks;
- control_net.get_all_blocks(blocks);
- for (auto block : blocks) {
- if (block->get_desc() == "Conv2d") {
- auto conv_block = (Conv2d*)block;
- conv_block->enable_direct();
- }
- }
- }
-
- ~ControlNet() {
+ ~ControlNet() override {
free_control_ctx();
}
void alloc_control_ctx(std::vector outs) {
struct ggml_init_params params;
params.mem_size = static_cast(outs.size() * ggml_tensor_overhead()) + 1024 * 1024;
- params.mem_buffer = NULL;
+ params.mem_buffer = nullptr;
params.no_alloc = true;
control_ctx = ggml_init(params);
@@ -366,20 +354,20 @@ struct ControlNet : public GGMLRunner {
}
void free_control_ctx() {
- if (control_buffer != NULL) {
+ if (control_buffer != nullptr) {
ggml_backend_buffer_free(control_buffer);
- control_buffer = NULL;
+ control_buffer = nullptr;
}
- if (control_ctx != NULL) {
+ if (control_ctx != nullptr) {
ggml_free(control_ctx);
- control_ctx = NULL;
+ control_ctx = nullptr;
}
- guided_hint = NULL;
+ guided_hint = nullptr;
guided_hint_cached = false;
controls.clear();
}
- std::string get_desc() {
+ std::string get_desc() override {
return "control_net";
}
@@ -391,12 +379,12 @@ struct ControlNet : public GGMLRunner {
struct ggml_tensor* hint,
struct ggml_tensor* timesteps,
struct ggml_tensor* context,
- struct ggml_tensor* y = NULL) {
+ struct ggml_tensor* y = nullptr) {
struct ggml_cgraph* gf = ggml_new_graph_custom(compute_ctx, CONTROL_NET_GRAPH_SIZE, false);
x = to_backend(x);
if (guided_hint_cached) {
- hint = NULL;
+ hint = nullptr;
} else {
hint = to_backend(hint);
}
@@ -404,16 +392,17 @@ struct ControlNet : public GGMLRunner {
y = to_backend(y);
timesteps = to_backend(timesteps);
- auto outs = control_net.forward(compute_ctx,
- runtime_backend,
+ auto runner_ctx = get_context();
+
+ auto outs = control_net.forward(&runner_ctx,
x,
hint,
- guided_hint_cached ? guided_hint : NULL,
+ guided_hint_cached ? guided_hint : nullptr,
timesteps,
context,
y);
- if (control_ctx == NULL) {
+ if (control_ctx == nullptr) {
alloc_control_ctx(outs);
}
@@ -431,8 +420,8 @@ struct ControlNet : public GGMLRunner {
struct ggml_tensor* timesteps,
struct ggml_tensor* context,
struct ggml_tensor* y,
- struct ggml_tensor** output = NULL,
- struct ggml_context* output_ctx = NULL) {
+ struct ggml_tensor** output = nullptr,
+ struct ggml_context* output_ctx = nullptr) {
// x: [N, in_channels, h, w]
// timesteps: [N, ]
// context: [N, max_position, hidden_size]([N, 77, 768]) or [1, max_position, hidden_size]
@@ -453,7 +442,7 @@ struct ControlNet : public GGMLRunner {
std::set ignore_tensors;
ModelLoader model_loader;
- if (!model_loader.init_from_file(file_path)) {
+ if (!model_loader.init_from_file_and_convert_name(file_path)) {
LOG_ERROR("init control net model loader from file failed: '%s'", file_path.c_str());
return false;
}
diff --git a/denoiser.hpp b/denoiser.hpp
index 3c53301bc..5ff45bb2c 100644
--- a/denoiser.hpp
+++ b/denoiser.hpp
@@ -19,7 +19,7 @@ struct SigmaSchedule {
};
struct DiscreteSchedule : SigmaSchedule {
- std::vector get_sigmas(uint32_t n, float sigma_min, float sigma_max, t_to_sigma_t t_to_sigma) {
+ std::vector get_sigmas(uint32_t n, float sigma_min, float sigma_max, t_to_sigma_t t_to_sigma) override {
std::vector result;
int t_max = TIMESTEPS - 1;
@@ -43,7 +43,7 @@ struct DiscreteSchedule : SigmaSchedule {
};
struct ExponentialSchedule : SigmaSchedule {
- std::vector get_sigmas(uint32_t n, float sigma_min, float sigma_max, t_to_sigma_t t_to_sigma) {
+ std::vector get_sigmas(uint32_t n, float sigma_min, float sigma_max, t_to_sigma_t t_to_sigma) override {
std::vector sigmas;
// Calculate step size
@@ -150,7 +150,7 @@ std::vector log_linear_interpolation(std::vector sigma_in,
https://research.nvidia.com/labs/toronto-ai/AlignYourSteps/howto.html
*/
struct AYSSchedule : SigmaSchedule {
- std::vector get_sigmas(uint32_t n, float sigma_min, float sigma_max, t_to_sigma_t t_to_sigma) {
+ std::vector get_sigmas(uint32_t n, float sigma_min, float sigma_max, t_to_sigma_t t_to_sigma) override {
const std::vector noise_levels[] = {
/* SD1.5 */
{14.6146412293f, 6.4745760956f, 3.8636745985f, 2.6946151520f,
@@ -204,7 +204,7 @@ struct AYSSchedule : SigmaSchedule {
* GITS Scheduler: https://github.com/zju-pi/diff-sampler/tree/main/gits-main
*/
struct GITSSchedule : SigmaSchedule {
- std::vector get_sigmas(uint32_t n, float sigma_min, float sigma_max, t_to_sigma_t t_to_sigma) {
+ std::vector get_sigmas(uint32_t n, float sigma_min, float sigma_max, t_to_sigma_t t_to_sigma) override {
if (sigma_max <= 0.0f) {
return std::vector{};
}
@@ -252,7 +252,7 @@ struct SGMUniformSchedule : SigmaSchedule {
};
struct KarrasSchedule : SigmaSchedule {
- std::vector get_sigmas(uint32_t n, float sigma_min, float sigma_max, t_to_sigma_t t_to_sigma) {
+ std::vector get_sigmas(uint32_t n, float sigma_min, float sigma_max, t_to_sigma_t t_to_sigma) override {
// These *COULD* be function arguments here,
// but does anybody ever bother to touch them?
float rho = 7.f;
@@ -350,15 +350,15 @@ struct CompVisDenoiser : public Denoiser {
float sigma_data = 1.0f;
- float sigma_min() {
+ float sigma_min() override {
return sigmas[0];
}
- float sigma_max() {
+ float sigma_max() override {
return sigmas[TIMESTEPS - 1];
}
- float sigma_to_t(float sigma) {
+ float sigma_to_t(float sigma) override {
float log_sigma = std::log(sigma);
std::vector dists;
dists.reserve(TIMESTEPS);
@@ -384,7 +384,7 @@ struct CompVisDenoiser : public Denoiser {
return t;
}
- float t_to_sigma(float t) {
+ float t_to_sigma(float t) override {
int low_idx = static_cast(std::floor(t));
int high_idx = static_cast(std::ceil(t));
float w = t - static_cast(low_idx);
@@ -392,7 +392,7 @@ struct CompVisDenoiser : public Denoiser {
return std::exp(log_sigma);
}
- std::vector get_scalings(float sigma) {
+ std::vector get_scalings(float sigma) override {
float c_skip = 1.0f;
float c_out = -sigma;
float c_in = 1.0f / std::sqrt(sigma * sigma + sigma_data * sigma_data);
@@ -400,19 +400,19 @@ struct CompVisDenoiser : public Denoiser {
}
// this function will modify noise/latent
- ggml_tensor* noise_scaling(float sigma, ggml_tensor* noise, ggml_tensor* latent) {
- ggml_tensor_scale(noise, sigma);
- ggml_tensor_add(latent, noise);
+ ggml_tensor* noise_scaling(float sigma, ggml_tensor* noise, ggml_tensor* latent) override {
+ ggml_ext_tensor_scale_inplace(noise, sigma);
+ ggml_ext_tensor_add_inplace(latent, noise);
return latent;
}
- ggml_tensor* inverse_noise_scaling(float sigma, ggml_tensor* latent) {
+ ggml_tensor* inverse_noise_scaling(float sigma, ggml_tensor* latent) override {
return latent;
}
};
struct CompVisVDenoiser : public CompVisDenoiser {
- std::vector get_scalings(float sigma) {
+ std::vector get_scalings(float sigma) override {
float c_skip = sigma_data * sigma_data / (sigma * sigma + sigma_data * sigma_data);
float c_out = -sigma * sigma_data / std::sqrt(sigma * sigma + sigma_data * sigma_data);
float c_in = 1.0f / std::sqrt(sigma * sigma + sigma_data * sigma_data);
@@ -429,19 +429,19 @@ struct EDMVDenoiser : public CompVisVDenoiser {
scheduler = std::make_shared();
}
- float t_to_sigma(float t) {
+ float t_to_sigma(float t) override {
return std::exp(t * 4 / (float)TIMESTEPS);
}
- float sigma_to_t(float s) {
+ float sigma_to_t(float s) override {
return 0.25 * std::log(s);
}
- float sigma_min() {
+ float sigma_min() override {
return min_sigma;
}
- float sigma_max() {
+ float sigma_max() override {
return max_sigma;
}
};
@@ -470,24 +470,24 @@ struct DiscreteFlowDenoiser : public Denoiser {
}
}
- float sigma_min() {
+ float sigma_min() override {
return sigmas[0];
}
- float sigma_max() {
+ float sigma_max() override {
return sigmas[TIMESTEPS - 1];
}
- float sigma_to_t(float sigma) {
+ float sigma_to_t(float sigma) override {
return sigma * 1000.f;
}
- float t_to_sigma(float t) {
+ float t_to_sigma(float t) override {
t = t + 1;
return time_snr_shift(shift, t / 1000.f);
}
- std::vector get_scalings(float sigma) {
+ std::vector get_scalings(float sigma) override {
float c_skip = 1.0f;
float c_out = -sigma;
float c_in = 1.0f;
@@ -495,15 +495,15 @@ struct DiscreteFlowDenoiser : public Denoiser {
}
// this function will modify noise/latent
- ggml_tensor* noise_scaling(float sigma, ggml_tensor* noise, ggml_tensor* latent) {
- ggml_tensor_scale(noise, sigma);
- ggml_tensor_scale(latent, 1.0f - sigma);
- ggml_tensor_add(latent, noise);
+ ggml_tensor* noise_scaling(float sigma, ggml_tensor* noise, ggml_tensor* latent) override {
+ ggml_ext_tensor_scale_inplace(noise, sigma);
+ ggml_ext_tensor_scale_inplace(latent, 1.0f - sigma);
+ ggml_ext_tensor_add_inplace(latent, noise);
return latent;
}
- ggml_tensor* inverse_noise_scaling(float sigma, ggml_tensor* latent) {
- ggml_tensor_scale(latent, 1.0f / (1.0f - sigma));
+ ggml_tensor* inverse_noise_scaling(float sigma, ggml_tensor* latent) override {
+ ggml_ext_tensor_scale_inplace(latent, 1.0f / (1.0f - sigma));
return latent;
}
};
@@ -529,24 +529,24 @@ struct FluxFlowDenoiser : public Denoiser {
}
}
- float sigma_min() {
+ float sigma_min() override {
return sigmas[0];
}
- float sigma_max() {
+ float sigma_max() override {
return sigmas[TIMESTEPS - 1];
}
- float sigma_to_t(float sigma) {
+ float sigma_to_t(float sigma) override {
return sigma;
}
- float t_to_sigma(float t) {
+ float t_to_sigma(float t) override {
t = t + 1;
return flux_time_shift(shift, 1.0f, t / TIMESTEPS);
}
- std::vector get_scalings(float sigma) {
+ std::vector get_scalings(float sigma) override {
float c_skip = 1.0f;
float c_out = -sigma;
float c_in = 1.0f;
@@ -554,15 +554,15 @@ struct FluxFlowDenoiser : public Denoiser {
}
// this function will modify noise/latent
- ggml_tensor* noise_scaling(float sigma, ggml_tensor* noise, ggml_tensor* latent) {
- ggml_tensor_scale(noise, sigma);
- ggml_tensor_scale(latent, 1.0f - sigma);
- ggml_tensor_add(latent, noise);
+ ggml_tensor* noise_scaling(float sigma, ggml_tensor* noise, ggml_tensor* latent) override {
+ ggml_ext_tensor_scale_inplace(noise, sigma);
+ ggml_ext_tensor_scale_inplace(latent, 1.0f - sigma);
+ ggml_ext_tensor_add_inplace(latent, noise);
return latent;
}
- ggml_tensor* inverse_noise_scaling(float sigma, ggml_tensor* latent) {
- ggml_tensor_scale(latent, 1.0f / (1.0f - sigma));
+ ggml_tensor* inverse_noise_scaling(float sigma, ggml_tensor* latent) override {
+ ggml_ext_tensor_scale_inplace(latent, 1.0f / (1.0f - sigma));
return latent;
}
};
@@ -620,7 +620,7 @@ static void sample_k_diffusion(sample_method_t method,
if (sigmas[i + 1] > 0) {
// x = x + noise_sampler(sigmas[i], sigmas[i + 1]) * s_noise * sigma_up
- ggml_tensor_set_f32_randn(noise, rng);
+ ggml_ext_im_set_randn_f32(noise, rng);
// noise = load_tensor_from_file(work_ctx, "./rand" + std::to_string(i+1) + ".bin");
{
float* vec_x = (float*)x->data;
@@ -820,7 +820,7 @@ static void sample_k_diffusion(sample_method_t method,
// Noise addition
if (sigmas[i + 1] > 0) {
- ggml_tensor_set_f32_randn(noise, rng);
+ ggml_ext_im_set_randn_f32(noise, rng);
{
float* vec_x = (float*)x->data;
float* vec_noise = (float*)noise->data;
@@ -1085,7 +1085,7 @@ static void sample_k_diffusion(sample_method_t method,
if (sigmas[i + 1] > 0) {
// x += sigmas[i + 1] * noise_sampler(sigmas[i], sigmas[i + 1])
- ggml_tensor_set_f32_randn(noise, rng);
+ ggml_ext_im_set_randn_f32(noise, rng);
// noise = load_tensor_from_file(res_ctx, "./rand" + std::to_string(i+1) + ".bin");
{
float* vec_x = (float*)x->data;
@@ -1276,7 +1276,7 @@ static void sample_k_diffusion(sample_method_t method,
}
}
if (eta > 0) {
- ggml_tensor_set_f32_randn(variance_noise, rng);
+ ggml_ext_im_set_randn_f32(variance_noise, rng);
float* vec_variance_noise =
(float*)variance_noise->data;
float* vec_x = (float*)x->data;
@@ -1444,7 +1444,7 @@ static void sample_k_diffusion(sample_method_t method,
if (eta > 0 && i != steps - 1) {
// In this case, x is still pred_noised_sample,
// continue in-place
- ggml_tensor_set_f32_randn(noise, rng);
+ ggml_ext_im_set_randn_f32(noise, rng);
float* vec_x = (float*)x->data;
float* vec_noise = (float*)noise->data;
for (int j = 0; j < ggml_nelements(x); j++) {
diff --git a/diffusion_model.hpp b/diffusion_model.hpp
index 92d3da5ad..307049814 100644
--- a/diffusion_model.hpp
+++ b/diffusion_model.hpp
@@ -3,22 +3,23 @@
#include "flux.hpp"
#include "mmdit.hpp"
+#include "qwen_image.hpp"
#include "unet.hpp"
#include "wan.hpp"
struct DiffusionParams {
- struct ggml_tensor* x = NULL;
- struct ggml_tensor* timesteps = NULL;
- struct ggml_tensor* context = NULL;
- struct ggml_tensor* c_concat = NULL;
- struct ggml_tensor* y = NULL;
- struct ggml_tensor* guidance = NULL;
+ struct ggml_tensor* x = nullptr;
+ struct ggml_tensor* timesteps = nullptr;
+ struct ggml_tensor* context = nullptr;
+ struct ggml_tensor* c_concat = nullptr;
+ struct ggml_tensor* y = nullptr;
+ struct ggml_tensor* guidance = nullptr;
std::vector ref_latents = {};
bool increase_ref_index = false;
int num_video_frames = -1;
std::vector controls = {};
float control_strength = 0.f;
- struct ggml_tensor* vace_context = NULL;
+ struct ggml_tensor* vace_context = nullptr;
float vace_strength = 1.f;
std::vector skip_layers = {};
};
@@ -27,14 +28,15 @@ struct DiffusionModel {
virtual std::string get_desc() = 0;
virtual void compute(int n_threads,
DiffusionParams diffusion_params,
- struct ggml_tensor** output = NULL,
- struct ggml_context* output_ctx = NULL) = 0;
+ struct ggml_tensor** output = nullptr,
+ struct ggml_context* output_ctx = nullptr) = 0;
virtual void alloc_params_buffer() = 0;
virtual void free_params_buffer() = 0;
virtual void free_compute_buffer() = 0;
virtual void get_param_tensors(std::map& tensors) = 0;
virtual size_t get_params_buffer_size() = 0;
virtual int64_t get_adm_in_channels() = 0;
+ virtual void set_flash_attn_enabled(bool enabled) = 0;
};
struct UNetModel : public DiffusionModel {
@@ -42,44 +44,47 @@ struct UNetModel : public DiffusionModel {
UNetModel(ggml_backend_t backend,
bool offload_params_to_cpu,
- const String2GGMLType& tensor_types = {},
- SDVersion version = VERSION_SD1,
- bool flash_attn = false)
- : unet(backend, offload_params_to_cpu, tensor_types, "model.diffusion_model", version, flash_attn) {
+ const String2TensorStorage& tensor_storage_map = {},
+ SDVersion version = VERSION_SD1)
+ : unet(backend, offload_params_to_cpu, tensor_storage_map, "model.diffusion_model", version) {
}
- std::string get_desc() {
+ std::string get_desc() override {
return unet.get_desc();
}
- void alloc_params_buffer() {
+ void alloc_params_buffer() override {
unet.alloc_params_buffer();
}
- void free_params_buffer() {
+ void free_params_buffer() override {
unet.free_params_buffer();
}
- void free_compute_buffer() {
+ void free_compute_buffer() override {
unet.free_compute_buffer();
}
- void get_param_tensors(std::map& tensors) {
+ void get_param_tensors(std::map& tensors) override {
unet.get_param_tensors(tensors, "model.diffusion_model");
}
- size_t get_params_buffer_size() {
+ size_t get_params_buffer_size() override {
return unet.get_params_buffer_size();
}
- int64_t get_adm_in_channels() {
+ int64_t get_adm_in_channels() override {
return unet.unet.adm_in_channels;
}
+ void set_flash_attn_enabled(bool enabled) {
+ unet.set_flash_attention_enabled(enabled);
+ }
+
void compute(int n_threads,
DiffusionParams diffusion_params,
- struct ggml_tensor** output = NULL,
- struct ggml_context* output_ctx = NULL) {
+ struct ggml_tensor** output = nullptr,
+ struct ggml_context* output_ctx = nullptr) override {
return unet.compute(n_threads,
diffusion_params.x,
diffusion_params.timesteps,
@@ -97,43 +102,46 @@ struct MMDiTModel : public DiffusionModel {
MMDiTModel(ggml_backend_t backend,
bool offload_params_to_cpu,
- bool flash_attn = false,
- const String2GGMLType& tensor_types = {})
- : mmdit(backend, offload_params_to_cpu, flash_attn, tensor_types, "model.diffusion_model") {
+ const String2TensorStorage& tensor_storage_map = {})
+ : mmdit(backend, offload_params_to_cpu, tensor_storage_map, "model.diffusion_model") {
}
- std::string get_desc() {
+ std::string get_desc() override {
return mmdit.get_desc();
}
- void alloc_params_buffer() {
+ void alloc_params_buffer() override {
mmdit.alloc_params_buffer();
}
- void free_params_buffer() {
+ void free_params_buffer() override {
mmdit.free_params_buffer();
}
- void free_compute_buffer() {
+ void free_compute_buffer() override {
mmdit.free_compute_buffer();
}
- void get_param_tensors(std::map& tensors) {
+ void get_param_tensors(std::map& tensors) override {
mmdit.get_param_tensors(tensors, "model.diffusion_model");
}
- size_t get_params_buffer_size() {
+ size_t get_params_buffer_size() override {
return mmdit.get_params_buffer_size();
}
- int64_t get_adm_in_channels() {
+ int64_t get_adm_in_channels() override {
return 768 + 1280;
}
+ void set_flash_attn_enabled(bool enabled) {
+ mmdit.set_flash_attention_enabled(enabled);
+ }
+
void compute(int n_threads,
DiffusionParams diffusion_params,
- struct ggml_tensor** output = NULL,
- struct ggml_context* output_ctx = NULL) {
+ struct ggml_tensor** output = nullptr,
+ struct ggml_context* output_ctx = nullptr) override {
return mmdit.compute(n_threads,
diffusion_params.x,
diffusion_params.timesteps,
@@ -150,45 +158,48 @@ struct FluxModel : public DiffusionModel {
FluxModel(ggml_backend_t backend,
bool offload_params_to_cpu,
- const String2GGMLType& tensor_types = {},
- SDVersion version = VERSION_FLUX,
- bool flash_attn = false,
- bool use_mask = false)
- : flux(backend, offload_params_to_cpu, tensor_types, "model.diffusion_model", version, flash_attn, use_mask) {
+ const String2TensorStorage& tensor_storage_map = {},
+ SDVersion version = VERSION_FLUX,
+ bool use_mask = false)
+ : flux(backend, offload_params_to_cpu, tensor_storage_map, "model.diffusion_model", version, use_mask) {
}
- std::string get_desc() {
+ std::string get_desc() override {
return flux.get_desc();
}
- void alloc_params_buffer() {
+ void alloc_params_buffer() override {
flux.alloc_params_buffer();
}
- void free_params_buffer() {
+ void free_params_buffer() override {
flux.free_params_buffer();
}
- void free_compute_buffer() {
+ void free_compute_buffer() override {
flux.free_compute_buffer();
}
- void get_param_tensors(std::map& tensors) {
+ void get_param_tensors(std::map& tensors) override {
flux.get_param_tensors(tensors, "model.diffusion_model");
}
- size_t get_params_buffer_size() {
+ size_t get_params_buffer_size() override {
return flux.get_params_buffer_size();
}
- int64_t get_adm_in_channels() {
+ int64_t get_adm_in_channels() override {
return 768;
}
+ void set_flash_attn_enabled(bool enabled) {
+ flux.set_flash_attention_enabled(enabled);
+ }
+
void compute(int n_threads,
DiffusionParams diffusion_params,
- struct ggml_tensor** output = NULL,
- struct ggml_context* output_ctx = NULL) {
+ struct ggml_tensor** output = nullptr,
+ struct ggml_context* output_ctx = nullptr) override {
return flux.compute(n_threads,
diffusion_params.x,
diffusion_params.timesteps,
@@ -210,52 +221,55 @@ struct WanModel : public DiffusionModel {
WanModel(ggml_backend_t backend,
bool offload_params_to_cpu,
- const String2GGMLType& tensor_types = {},
- const std::string prefix = "model.diffusion_model",
- SDVersion version = VERSION_WAN2,
- bool flash_attn = false)
- : prefix(prefix), wan(backend, offload_params_to_cpu, tensor_types, prefix, version, flash_attn) {
+ const String2TensorStorage& tensor_storage_map = {},
+ const std::string prefix = "model.diffusion_model",
+ SDVersion version = VERSION_WAN2)
+ : prefix(prefix), wan(backend, offload_params_to_cpu, tensor_storage_map, prefix, version) {
}
- std::string get_desc() {
+ std::string get_desc() override {
return wan.get_desc();
}
- void alloc_params_buffer() {
+ void alloc_params_buffer() override {
wan.alloc_params_buffer();
}
- void free_params_buffer() {
+ void free_params_buffer() override {
wan.free_params_buffer();
}
- void free_compute_buffer() {
+ void free_compute_buffer() override {
wan.free_compute_buffer();
}
- void get_param_tensors(std::map& tensors) {
+ void get_param_tensors(std::map& tensors) override {
wan.get_param_tensors(tensors, prefix);
}
- size_t get_params_buffer_size() {
+ size_t get_params_buffer_size() override {
return wan.get_params_buffer_size();
}
- int64_t get_adm_in_channels() {
+ int64_t get_adm_in_channels() override {
return 768;
}
+ void set_flash_attn_enabled(bool enabled) {
+ wan.set_flash_attention_enabled(enabled);
+ }
+
void compute(int n_threads,
DiffusionParams diffusion_params,
- struct ggml_tensor** output = NULL,
- struct ggml_context* output_ctx = NULL) {
+ struct ggml_tensor** output = nullptr,
+ struct ggml_context* output_ctx = nullptr) override {
return wan.compute(n_threads,
diffusion_params.x,
diffusion_params.timesteps,
diffusion_params.context,
diffusion_params.y,
diffusion_params.c_concat,
- NULL,
+ nullptr,
diffusion_params.vace_context,
diffusion_params.vace_strength,
output,
@@ -263,4 +277,63 @@ struct WanModel : public DiffusionModel {
}
};
+struct QwenImageModel : public DiffusionModel {
+ std::string prefix;
+ Qwen::QwenImageRunner qwen_image;
+
+ QwenImageModel(ggml_backend_t backend,
+ bool offload_params_to_cpu,
+ const String2TensorStorage& tensor_storage_map = {},
+ const std::string prefix = "model.diffusion_model",
+ SDVersion version = VERSION_QWEN_IMAGE)
+ : prefix(prefix), qwen_image(backend, offload_params_to_cpu, tensor_storage_map, prefix, version) {
+ }
+
+ std::string get_desc() override {
+ return qwen_image.get_desc();
+ }
+
+ void alloc_params_buffer() override {
+ qwen_image.alloc_params_buffer();
+ }
+
+ void free_params_buffer() override {
+ qwen_image.free_params_buffer();
+ }
+
+ void free_compute_buffer() override {
+ qwen_image.free_compute_buffer();
+ }
+
+ void get_param_tensors(std::map& tensors) override {
+ qwen_image.get_param_tensors(tensors, prefix);
+ }
+
+ size_t get_params_buffer_size() override {
+ return qwen_image.get_params_buffer_size();
+ }
+
+ int64_t get_adm_in_channels() override {
+ return 768;
+ }
+
+ void set_flash_attn_enabled(bool enabled) {
+ qwen_image.set_flash_attention_enabled(enabled);
+ }
+
+ void compute(int n_threads,
+ DiffusionParams diffusion_params,
+ struct ggml_tensor** output = nullptr,
+ struct ggml_context* output_ctx = nullptr) override {
+ return qwen_image.compute(n_threads,
+ diffusion_params.x,
+ diffusion_params.timesteps,
+ diffusion_params.context,
+ diffusion_params.ref_latents,
+ true, // increase_ref_index
+ output,
+ output_ctx);
+ }
+};
+
#endif
diff --git a/docs/build.md b/docs/build.md
new file mode 100644
index 000000000..1ba582d9f
--- /dev/null
+++ b/docs/build.md
@@ -0,0 +1,173 @@
+# Build from scratch
+
+## Get the Code
+
+```
+git clone --recursive https://github.com/leejet/stable-diffusion.cpp
+cd stable-diffusion.cpp
+```
+
+- If you have already cloned the repository, you can use the following commands to update it to the latest code.
+
+```
+cd stable-diffusion.cpp
+git pull origin master
+git submodule init
+git submodule update
+```
+
+## Build (CPU only)
+
+If you don't have a GPU or CUDA installed, you can build a CPU-only version.
+
+```shell
+mkdir build && cd build
+cmake ..
+cmake --build . --config Release
+```
+
+## Build with OpenBLAS
+
+```shell
+mkdir build && cd build
+cmake .. -DGGML_OPENBLAS=ON
+cmake --build . --config Release
+```
+
+## Build with CUDA
+
+This provides GPU acceleration using NVIDIA GPUs. Make sure to have the CUDA toolkit installed. You can install it from your Linux distro's package manager (e.g. `apt install nvidia-cuda-toolkit`) or download it from [CUDA Toolkit](https://developer.nvidia.com/cuda-downloads). At least 4 GB of VRAM is recommended.
+
+```shell
+mkdir build && cd build
+cmake .. -DSD_CUDA=ON
+cmake --build . --config Release
+```
+
+## Build with HipBLAS
+
+This provides GPU acceleration using AMD GPUs. Make sure to have the ROCm toolkit installed.
+To build for a GPU architecture other than the one installed in your system, set `$GFX_NAME` manually to the desired architecture, replacing the first command below (see the manual override example after the build commands). This is also necessary if your GPU is not officially supported by ROCm; for example, consumer RDNA2 cards require setting `$GFX_NAME` to `gfx1030` manually.
+
+Windows users: refer to [docs/hipBLAS_on_Windows.md](docs%2FhipBLAS_on_Windows.md) for a comprehensive guide.
+
+```shell
+mkdir build && cd build
+if command -v rocminfo; then export GFX_NAME=$(rocminfo | awk '/ *Name: +gfx[1-9]/ {print $2; exit}'); else echo "rocminfo missing!"; fi
+if [ -z "${GFX_NAME}" ]; then echo "Error: Couldn't detect GPU!"; else echo "Building for GPU: ${GFX_NAME}"; fi
+cmake .. -G "Ninja" -DCMAKE_C_COMPILER=clang -DCMAKE_CXX_COMPILER=clang++ -DSD_HIPBLAS=ON -DCMAKE_BUILD_TYPE=Release -DGPU_TARGETS=$GFX_NAME -DAMDGPU_TARGETS=$GFX_NAME -DCMAKE_BUILD_WITH_INSTALL_RPATH=ON -DCMAKE_POSITION_INDEPENDENT_CODE=ON
+cmake --build . --config Release
+```
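+
+If `rocminfo` is not available, or you are targeting a different card than the one installed, set `$GFX_NAME` yourself before running `cmake`. For example (using `gfx1030`, the target for consumer RDNA2 cards, purely as an illustration):
+
+```shell
+# example only: substitute the LLVM target that matches your GPU architecture
+export GFX_NAME=gfx1030
+# then run the same cmake configure and build commands shown above
+```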
+
+## Build with MUSA
+
+This provides GPU acceleration using Moore Threads GPU. Make sure to have the MUSA toolkit installed.
+
+```shell
+mkdir build && cd build
+cmake .. -DCMAKE_C_COMPILER=/usr/local/musa/bin/clang -DCMAKE_CXX_COMPILER=/usr/local/musa/bin/clang++ -DSD_MUSA=ON -DCMAKE_BUILD_TYPE=Release
+cmake --build . --config Release
+```
+
+## Build with Metal
+
+Using Metal runs the computation on the GPU. There are currently some issues with Metal when performing operations on very large matrices, which makes it highly inefficient. Performance improvements are expected in the near future.
+
+```shell
+mkdir build && cd build
+cmake .. -DSD_METAL=ON
+cmake --build . --config Release
+```
+
+## Build with Vulkan
+
+Install Vulkan SDK from https://www.lunarg.com/vulkan-sdk/.
+
+```shell
+mkdir build && cd build
+cmake .. -DSD_VULKAN=ON
+cmake --build . --config Release
+```
+
+## Build with OpenCL (for Adreno GPU)
+
+Currently, this backend supports only Adreno GPUs and is primarily optimized for the Q4_0 quantization type.
+
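+Since this backend is tuned for Q4_0, a typical setup is to quantize the weights to `q4_0` at load time. A minimal usage sketch, assuming the stock `sd` CLI and its `-m`, `-p`, and `--type` flags (the model path is only an example):
+
+```shell
+# model path and prompt are placeholders; --type q4_0 requests Q4_0 weights at load time
+./build/bin/sd -m ./models/v1-5-pruned-emaonly.safetensors -p "a lovely cat" --type q4_0
+```
+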
+To build for Windows on ARM, please refer to [Windows 11 Arm64](https://github.com/ggml-org/llama.cpp/blob/master/docs/backend/OPENCL.md#windows-11-arm64).
+
+Building for Android:
+
+ Android NDK:
+ Download and install the Android NDK from the [official Android developer site](https://developer.android.com/ndk/downloads).
+
+Setup OpenCL Dependencies for NDK:
+
+You need to provide OpenCL headers and the ICD loader library to your NDK sysroot.
+
+* OpenCL Headers:
+ ```bash
+ # In a temporary working directory
+ git clone https://github.com/KhronosGroup/OpenCL-Headers
+ cd OpenCL-Headers
+ # Replace <YOUR_NDK_PATH> with your actual NDK installation path
+ # e.g., cp -r CL /path/to/android-ndk-r26c/toolchains/llvm/prebuilt/linux-x86_64/sysroot/usr/include
+ sudo cp -r CL <YOUR_NDK_PATH>/toolchains/llvm/prebuilt/linux-x86_64/sysroot/usr/include
+ cd ..
+ ```
+
+* OpenCL ICD Loader:
+ ```shell
+ # In the same temporary working directory
+ git clone https://github.com/KhronosGroup/OpenCL-ICD-Loader
+ cd OpenCL-ICD-Loader
+ mkdir build_ndk && cd build_ndk
+
+ # Replace <YOUR_NDK_PATH> in CMAKE_TOOLCHAIN_FILE and OPENCL_ICD_LOADER_HEADERS_DIR below
+ cmake .. -G Ninja -DCMAKE_BUILD_TYPE=Release \
+ -DCMAKE_TOOLCHAIN_FILE=<YOUR_NDK_PATH>/build/cmake/android.toolchain.cmake \
+ -DOPENCL_ICD_LOADER_HEADERS_DIR=