diff --git a/.clang-tidy b/.clang-tidy
new file mode 100644
index 000000000..63924a0ba
--- /dev/null
+++ b/.clang-tidy
@@ -0,0 +1,10 @@
+Checks: >
+ modernize-make-shared,
+ modernize-use-nullptr,
+ modernize-use-override,
+ modernize-pass-by-value,
+ modernize-return-braced-init-list,
+ modernize-deprecated-headers,
+HeaderFilterRegex: '^$'
+WarningsAsErrors: ''
+FormatStyle: none
\ No newline at end of file
diff --git a/.github/ISSUE_TEMPLATE/bug_report.yml b/.github/ISSUE_TEMPLATE/bug_report.yml
new file mode 100644
index 000000000..5affda3bc
--- /dev/null
+++ b/.github/ISSUE_TEMPLATE/bug_report.yml
@@ -0,0 +1,73 @@
+name: 🐞 Bug Report
+description: Report a bug or unexpected behavior
+title: "[Bug] "
+labels: ["bug"]
+body:
+ - type: markdown
+ attributes:
+ value: |
+ Please use this template and include as many details as possible to help us reproduce and fix the issue.
+ - type: textarea
+ id: commit
+ attributes:
+ label: Git commit
+ description: Which commit are you trying to compile?
+ placeholder: |
+        $ git rev-parse HEAD
+ 40a6a8710ec15b1b5db6b5a098409f6bc8f654a4
+ validations:
+ required: true
+ - type: input
+ id: os
+ attributes:
+ label: Operating System & Version
+ placeholder: e.g. “Ubuntu 22.04”, “Windows 11 23H2”, “macOS 14.3”
+ validations:
+ required: true
+ - type: dropdown
+ id: backends
+ attributes:
+ label: GGML backends
+      description: Which GGML backends are affected, as far as you know?
+ options: [CPU, CUDA, HIP, Metal, Musa, SYCL, Vulkan, OpenCL]
+ multiple: true
+ validations:
+ required: true
+ - type: input
+ id: cmd_arguments
+ attributes:
+ label: Command-line arguments used
+ placeholder: The full command line you ran (with all flags)
+ validations:
+ required: true
+ - type: textarea
+ id: steps_to_reproduce
+ attributes:
+ label: Steps to reproduce
+ placeholder: A step-by-step list of what you did
+ validations:
+ required: true
+ - type: textarea
+ id: expected_behavior
+ attributes:
+ label: What you expected to happen
+ placeholder: Describe the expected behavior or result
+ validations:
+ required: true
+ - type: textarea
+ id: actual_behavior
+ attributes:
+ label: What actually happened
+ placeholder: Describe what you saw instead (errors, logs, crash, etc.)
+ validations:
+ required: true
+ - type: textarea
+ id: logs_and_errors
+ attributes:
+ label: Logs / error messages / stack trace
+ placeholder: Paste complete logs or error output
+ - type: textarea
+ id: additional_info
+ attributes:
+ label: Additional context / environment details
+ placeholder: e.g. CPU model, GPU, RAM, model file versions, quantization type, etc.
diff --git a/.github/ISSUE_TEMPLATE/feature_request.yml b/.github/ISSUE_TEMPLATE/feature_request.yml
new file mode 100644
index 000000000..243faca4c
--- /dev/null
+++ b/.github/ISSUE_TEMPLATE/feature_request.yml
@@ -0,0 +1,33 @@
+name: 💡 Feature Request
+description: Suggest a new feature or improvement
+title: "[Feature] "
+labels: ["enhancement"]
+body:
+ - type: markdown
+ attributes:
+ value: |
+ Thank you for suggesting an improvement! Please fill in the fields below.
+ - type: input
+ id: summary
+ attributes:
+ label: Feature Summary
+ placeholder: A one-line summary of the feature you’d like
+ validations:
+ required: true
+ - type: textarea
+ id: description
+ attributes:
+ label: Detailed Description
+ placeholder: What problem does this solve? How do you expect it to work?
+ validations:
+ required: true
+ - type: textarea
+ id: alternatives
+ attributes:
+ label: Alternatives you considered
+ placeholder: Any alternative designs or workarounds you tried
+ - type: textarea
+ id: additional_context
+ attributes:
+ label: Additional context
+ placeholder: Any extra information (use cases, related functionalities, constraints)
diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml
index f891717c9..8dd7ce19c 100644
--- a/.github/workflows/build.yml
+++ b/.github/workflows/build.yml
@@ -65,7 +65,7 @@ jobs:
- name: Get commit hash
id: commit
- if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/main' ) || github.event.inputs.create_release == 'true' }}
+ if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }}
uses: pr-mpt/actions-commit-hash@v2
- name: Fetch system info
@@ -118,7 +118,7 @@ jobs:
- name: Get commit hash
id: commit
- if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/main' ) || github.event.inputs.create_release == 'true' }}
+ if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }}
uses: pr-mpt/actions-commit-hash@v2
- name: Fetch system info
@@ -149,7 +149,7 @@ jobs:
runs-on: windows-2025
env:
- VULKAN_VERSION: 1.3.261.1
+ VULKAN_VERSION: 1.4.328.1
strategy:
matrix:
@@ -164,8 +164,6 @@ jobs:
defines: "-DGGML_NATIVE=OFF -DGGML_AVX512=ON -DGGML_AVX=ON -DGGML_AVX2=ON -DSD_BUILD_SHARED_LIBS=ON"
- build: "cuda12"
defines: "-DSD_CUDA=ON -DSD_BUILD_SHARED_LIBS=ON -DCMAKE_CUDA_ARCHITECTURES=90;89;86;80;75"
- # - build: "rocm5.5"
- # defines: '-G Ninja -DCMAKE_C_COMPILER=clang -DCMAKE_CXX_COMPILER=clang++ -DSD_HIPBLAS=ON -DCMAKE_BUILD_TYPE=Release -DAMDGPU_TARGETS="gfx1100;gfx1102;gfx1030" -DSD_BUILD_SHARED_LIBS=ON'
- build: 'vulkan'
defines: "-DSD_VULKAN=ON -DSD_BUILD_SHARED_LIBS=ON"
steps:
@@ -184,24 +182,11 @@ jobs:
method: "network"
sub-packages: '["nvcc", "cudart", "cublas", "cublas_dev", "thrust", "visual_studio_integration"]'
- - name: Install rocm-toolkit
- id: rocm-toolkit
- if: ${{ matrix.build == 'rocm5.5' }}
- uses: Cyberhan123/rocm-toolkit@v0.1.0
- with:
- rocm: "5.5.0"
-
- - name: Install Ninja
- id: install-ninja
- if: ${{ matrix.build == 'rocm5.5' }}
- uses: urkle/action-get-ninja@v1
- with:
- version: 1.11.1
- name: Install Vulkan SDK
id: get_vulkan
if: ${{ matrix.build == 'vulkan' }}
run: |
- curl.exe -o $env:RUNNER_TEMP/VulkanSDK-Installer.exe -L "https://sdk.lunarg.com/sdk/download/$%7Benv:VULKAN_VERSION%7D/windows/VulkanSDK-$%7Benv:VULKAN_VERSION%7D-Installer.exe"
+ curl.exe -o $env:RUNNER_TEMP/VulkanSDK-Installer.exe -L "https://sdk.lunarg.com/sdk/download/$%7Benv:VULKAN_VERSION%7D/windows/vulkansdk-windows-X64-$%7Benv:VULKAN_VERSION%7D.exe"
& "$env:RUNNER_TEMP\VulkanSDK-Installer.exe" --accept-licenses --default-answer --confirm-command install
Add-Content $env:GITHUB_ENV "VULKAN_SDK=C:\VulkanSDK\${env:VULKAN_VERSION}"
Add-Content $env:GITHUB_PATH "C:\VulkanSDK\${env:VULKAN_VERSION}\bin"
@@ -254,7 +239,7 @@ jobs:
- name: Copy and pack Cuda runtime
id: pack_cuda_runtime
- if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' && matrix.build == 'cuda12' ) || github.event.inputs.create_release == 'true' }}
+ if: ${{ matrix.build == 'cuda12' && (github.event_name == 'push' && github.ref == 'refs/heads/master' || github.event.inputs.create_release == 'true') }}
run: |
echo "Cuda install location: ${{steps.cuda-toolkit.outputs.CUDA_PATH}}"
$dst='.\build\bin\cudart\'
@@ -262,7 +247,7 @@ jobs:
7z a cudart-sd-bin-win-cu12-x64.zip $dst\*
- name: Upload Cuda runtime
- if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' && matrix.build == 'cuda12' ) || github.event.inputs.create_release == 'true' }}
+ if: ${{ matrix.build == 'cuda12' && (github.event_name == 'push' && github.ref == 'refs/heads/master' || github.event.inputs.create_release == 'true') }}
uses: actions/upload-artifact@v4
with:
name: sd-cudart-sd-bin-win-cu12-x64.zip
@@ -277,6 +262,104 @@ jobs:
path: |
sd-${{ env.BRANCH_NAME }}-${{ steps.commit.outputs.short }}-bin-win-${{ matrix.build }}-x64.zip
+ windows-latest-cmake-hip:
+ runs-on: windows-2022
+
+ env:
+ HIPSDK_INSTALLER_VERSION: "25.Q3"
+ GPU_TARGETS: "gfx1151;gfx1200;gfx1201;gfx1100;gfx1101;gfx1102;gfx1030;gfx1031;gfx1032"
+
+ steps:
+ - uses: actions/checkout@v3
+ with:
+ submodules: recursive
+
+ - name: Cache ROCm Installation
+ id: cache-rocm
+ uses: actions/cache@v4
+ with:
+ path: C:\Program Files\AMD\ROCm
+ key: rocm-${{ env.HIPSDK_INSTALLER_VERSION }}-${{ runner.os }}
+
+ - name: ccache
+ uses: ggml-org/ccache-action@v1.2.16
+ with:
+ key: windows-latest-cmake-hip-${{ env.HIPSDK_INSTALLER_VERSION }}-x64
+ evict-old-files: 1d
+
+ - name: Install ROCm
+ if: steps.cache-rocm.outputs.cache-hit != 'true'
+ run: |
+ $ErrorActionPreference = "Stop"
+ write-host "Downloading AMD HIP SDK Installer"
+ Invoke-WebRequest -Uri "https://download.amd.com/developer/eula/rocm-hub/AMD-Software-PRO-Edition-$%7B%7B%20env.HIPSDK_INSTALLER_VERSION%20%7D%7D-WinSvr2022-For-HIP.exe" -OutFile "${env:RUNNER_TEMP}\rocm-install.exe"
+ write-host "Installing AMD HIP SDK"
+ $proc = Start-Process "${env:RUNNER_TEMP}\rocm-install.exe" -ArgumentList '-install' -NoNewWindow -PassThru
+ $completed = $proc.WaitForExit(600000)
+ if (-not $completed) {
+ Write-Error "ROCm installation timed out after 10 minutes. Killing the process"
+ $proc.Kill()
+ exit 1
+ }
+ if ($proc.ExitCode -ne 0) {
+ Write-Error "ROCm installation failed with exit code $($proc.ExitCode)"
+ exit 1
+ }
+ write-host "Completed AMD HIP SDK installation"
+
+ - name: Verify ROCm
+ run: |
+ # Find and test ROCm installation
+ $clangPath = Get-ChildItem 'C:\Program Files\AMD\ROCm\*\bin\clang.exe' | Select-Object -First 1
+ if (-not $clangPath) {
+ Write-Error "ROCm installation not found"
+ exit 1
+ }
+ & $clangPath.FullName --version
+ # Set HIP_PATH environment variable for later steps
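+        # (the two split-path calls strip "\bin\clang.exe" from the resolved path, leaving the ROCm root)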
+ echo "HIP_PATH=$(Resolve-Path 'C:\Program Files\AMD\ROCm\*\bin\clang.exe' | split-path | split-path)" >> $env:GITHUB_ENV
+
+ - name: Build
+ run: |
+ mkdir build
+ cd build
+ $env:CMAKE_PREFIX_PATH="${env:HIP_PATH}"
+ cmake .. `
+ -G "Unix Makefiles" `
+ -DSD_HIPBLAS=ON `
+ -DSD_BUILD_SHARED_LIBS=ON `
+ -DGGML_NATIVE=OFF `
+ -DCMAKE_C_COMPILER=clang `
+ -DCMAKE_CXX_COMPILER=clang++ `
+ -DCMAKE_BUILD_TYPE=Release `
+ -DGPU_TARGETS="${{ env.GPU_TARGETS }}"
+ cmake --build . --config Release --parallel ${env:NUMBER_OF_PROCESSORS}
+
+ - name: Get commit hash
+ id: commit
+ if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }}
+ uses: pr-mpt/actions-commit-hash@v2
+
+ - name: Pack artifacts
+ if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }}
+ run: |
+ md "build\bin\rocblas\library\"
+ md "build\bin\hipblaslt\library"
+ cp "${env:HIP_PATH}\bin\hipblas.dll" "build\bin\"
+ cp "${env:HIP_PATH}\bin\hipblaslt.dll" "build\bin\"
+ cp "${env:HIP_PATH}\bin\rocblas.dll" "build\bin\"
+ cp "${env:HIP_PATH}\bin\rocblas\library\*" "build\bin\rocblas\library\"
+ cp "${env:HIP_PATH}\bin\hipblaslt\library\*" "build\bin\hipblaslt\library\"
+ 7z a sd-${{ env.BRANCH_NAME }}-${{ steps.commit.outputs.short }}-bin-win-rocm-x64.zip .\build\bin\*
+
+ - name: Upload artifacts
+ if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }}
+ uses: actions/upload-artifact@v4
+ with:
+ name: sd-${{ env.BRANCH_NAME }}-${{ steps.commit.outputs.short }}-bin-win-rocm-x64.zip
+ path: |
+ sd-${{ env.BRANCH_NAME }}-${{ steps.commit.outputs.short }}-bin-win-rocm-x64.zip
+
release:
if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }}
@@ -286,8 +369,14 @@ jobs:
- ubuntu-latest-cmake
- macOS-latest-cmake
- windows-latest-cmake
+ - windows-latest-cmake-hip
steps:
+ - name: Clone
+ uses: actions/checkout@v3
+ with:
+ fetch-depth: 0
+
- name: Download artifacts
id: download-artifact
uses: actions/download-artifact@v4
@@ -296,20 +385,27 @@ jobs:
pattern: sd-*
merge-multiple: true
+ - name: Get commit count
+ id: commit_count
+ run: |
+ echo "count=$(git rev-list --count HEAD)" >> $GITHUB_OUTPUT
+
- name: Get commit hash
id: commit
uses: pr-mpt/actions-commit-hash@v2
- name: Create release
id: create_release
+ if: ${{ github.event_name == 'workflow_dispatch' || github.ref_name == 'master' }}
uses: anzz1/action-create-release@v1
env:
GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
with:
- tag_name: ${{ env.BRANCH_NAME }}-${{ steps.commit.outputs.short }}
+ tag_name: ${{ format('{0}-{1}-{2}', env.BRANCH_NAME, steps.commit_count.outputs.count, steps.commit.outputs.short) }}
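+          # produces tags like master-123-abc1234 (branch, commit count, short hash)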
- name: Upload release
id: upload_release
+ if: ${{ github.event_name == 'workflow_dispatch' || github.ref_name == 'master' }}
uses: actions/github-script@v3
with:
github-token: ${{secrets.GITHUB_TOKEN}}
diff --git a/.gitignore b/.gitignore
index dd4f6435a..b0e3af83f 100644
--- a/.gitignore
+++ b/.gitignore
@@ -12,3 +12,4 @@ test/
output*.png
models*
*.log
+preview.png
diff --git a/CMakeLists.txt b/CMakeLists.txt
index c0735e5b1..7dc36f0fd 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -33,6 +33,7 @@ option(SD_SYCL "sd: sycl backend" OFF)
option(SD_MUSA "sd: musa backend" OFF)
option(SD_FAST_SOFTMAX "sd: x1.5 faster softmax, indeterministic (sometimes, same seed don't generate same image), cuda only" OFF)
option(SD_BUILD_SHARED_LIBS "sd: build shared libs" OFF)
+option(SD_BUILD_SHARED_GGML_LIB "sd: build ggml as a separate shared lib" OFF)
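+# Illustrative configure step: pair this with SD_BUILD_SHARED_LIBS to ship ggml
+# as its own shared library, e.g.
+#   cmake -B build -DSD_BUILD_SHARED_LIBS=ON -DSD_BUILD_SHARED_GGML_LIB=ON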
option(SD_USE_SYSTEM_GGML "sd: use system-installed GGML library" OFF)
#option(SD_BUILD_SERVER "sd: build server example" ON)
@@ -86,18 +87,21 @@ file(GLOB SD_LIB_SOURCES
"*.hpp"
)
-# we can get only one share lib
if(SD_BUILD_SHARED_LIBS)
message("-- Build shared library")
message(${SD_LIB_SOURCES})
- set(BUILD_SHARED_LIBS OFF)
+ if(NOT SD_BUILD_SHARED_GGML_LIB)
+ set(BUILD_SHARED_LIBS OFF)
+ endif()
add_library(${SD_LIB} SHARED ${SD_LIB_SOURCES})
add_definitions(-DSD_BUILD_SHARED_LIB)
target_compile_definitions(${SD_LIB} PRIVATE -DSD_BUILD_DLL)
set(CMAKE_POSITION_INDEPENDENT_CODE ON)
else()
message("-- Build static library")
- set(BUILD_SHARED_LIBS OFF)
+ if(NOT SD_BUILD_SHARED_GGML_LIB)
+ set(BUILD_SHARED_LIBS OFF)
+ endif()
add_library(${SD_LIB} STATIC ${SD_LIB_SOURCES})
endif()
diff --git a/Dockerfile b/Dockerfile
index bd9a378f0..417335793 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -1,16 +1,21 @@
ARG UBUNTU_VERSION=22.04
-FROM ubuntu:$UBUNTU_VERSION as build
+FROM ubuntu:$UBUNTU_VERSION AS build
-RUN apt-get update && apt-get install -y build-essential git cmake
+RUN apt-get update && apt-get install -y --no-install-recommends build-essential git cmake
WORKDIR /sd.cpp
COPY . .
-RUN mkdir build && cd build && cmake .. && cmake --build . --config Release
+RUN cmake . -B ./build
+RUN cmake --build ./build --config Release --parallel
-FROM ubuntu:$UBUNTU_VERSION as runtime
+FROM ubuntu:$UBUNTU_VERSION AS runtime
+
+RUN apt-get update && \
+ apt-get install --yes --no-install-recommends libgomp1 && \
+ apt-get clean
COPY --from=build /sd.cpp/build/bin/sd /sd
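+
+# Usage sketch (assumes the image keeps /sd as its entrypoint; tag and mounts are illustrative):
+#   docker build -t sd-cpp .
+#   docker run -v $PWD/models:/models -v $PWD/output:/output sd-cpp -m /models/v1-5-pruned-emaonly.safetensors -p "a lovely cat" -o /output/output.png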
diff --git a/Dockerfile.sycl b/Dockerfile.sycl
new file mode 100644
index 000000000..1b855d6e4
--- /dev/null
+++ b/Dockerfile.sycl
@@ -0,0 +1,19 @@
+ARG SYCL_VERSION=2025.1.0-0
+
+FROM intel/oneapi-basekit:${SYCL_VERSION}-devel-ubuntu24.04 AS build
+
+RUN apt-get update && apt-get install -y cmake
+
+WORKDIR /sd.cpp
+
+COPY . .
+
+RUN mkdir build && cd build && \
+ cmake .. -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx -DSD_SYCL=ON -DCMAKE_BUILD_TYPE=Release && \
+ cmake --build . --config Release -j$(nproc)
+
+FROM intel/oneapi-basekit:${SYCL_VERSION}-devel-ubuntu24.04 AS runtime
+
+COPY --from=build /sd.cpp/build/bin/sd /sd
+
+ENTRYPOINT [ "/sd" ]
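+
+# Usage sketch (illustrative tag; GPU passthrough flags depend on your host setup):
+#   docker build -f Dockerfile.sycl -t sd-cpp-sycl .
+#   docker run --device /dev/dri -v $PWD/models:/models sd-cpp-sycl -m /models/v1-5-pruned-emaonly.safetensors -p "a lovely cat"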
diff --git a/README.md b/README.md
index a4585be0c..5cc6e4458 100644
--- a/README.md
+++ b/README.md
@@ -4,25 +4,46 @@
# stable-diffusion.cpp
+
+

+
+
Diffusion model(SD,Flux,Wan,...) inference in pure C/C++
***Note that this project is under active development. \
API and command-line option may change frequently.***
+## 🔥Important News
+
+* **2025/10/13** 🚀 stable-diffusion.cpp now supports **Qwen-Image-Edit / Qwen-Image-Edit 2509**
+ 👉 Details: [PR #877](https://github.com/leejet/stable-diffusion.cpp/pull/877)
+
+* **2025/10/12** 🚀 stable-diffusion.cpp now supports **Qwen-Image**
+ 👉 Details: [PR #851](https://github.com/leejet/stable-diffusion.cpp/pull/851)
+
+* **2025/09/14** 🚀 stable-diffusion.cpp now supports **Wan2.1 Vace**
+ 👉 Details: [PR #819](https://github.com/leejet/stable-diffusion.cpp/pull/819)
+
+* **2025/09/06** 🚀 stable-diffusion.cpp now supports **Wan2.1 / Wan2.2**
+ 👉 Details: [PR #778](https://github.com/leejet/stable-diffusion.cpp/pull/778)
+
## Features
-- Plain C/C++ implementation based on [ggml](https://github.com/ggerganov/ggml), working in the same way as [llama.cpp](https://github.com/ggerganov/llama.cpp)
+- Plain C/C++ implementation based on [ggml](https://github.com/ggml-org/ggml), working in the same way as [llama.cpp](https://github.com/ggml-org/llama.cpp)
- Super lightweight and without external dependencies
- Supported models
- Image Models
- SD1.x, SD2.x, [SD-Turbo](https://huggingface.co/stabilityai/sd-turbo)
- SDXL, [SDXL-Turbo](https://huggingface.co/stabilityai/sdxl-turbo)
- - !!!The VAE in SDXL encounters NaN issues under FP16, but unfortunately, the ggml_conv_2d only operates under FP16. Hence, a parameter is needed to specify the VAE that has fixed the FP16 NaN issue. You can find it here: [SDXL VAE FP16 Fix](https://huggingface.co/madebyollin/sdxl-vae-fp16-fix/blob/main/sdxl_vae.safetensors).
+ - [Some SD1.x and SDXL distilled models](./docs/distilled_sd.md)
- [SD3/SD3.5](./docs/sd3.md)
- [Flux-dev/Flux-schnell](./docs/flux.md)
- [Chroma](./docs/chroma.md)
+ - [Chroma1-Radiance](./docs/chroma_radiance.md)
+ - [Qwen Image](./docs/qwen_image.md)
- Image Edit Models
- [FLUX.1-Kontext-dev](./docs/kontext.md)
+ - [Qwen Image Edit/Qwen Image Edit 2509](./docs/qwen_image_edit.md)
- Video Models
- [Wan2.1/Wan2.2](./docs/wan.md)
- [PhotoMaker](https://github.com/TencentARC/PhotoMaker) support.
@@ -31,14 +52,22 @@ API and command-line option may change frequently.***
- Latent Consistency Models support (LCM/LCM-LoRA)
- Faster and memory efficient latent decoding with [TAESD](https://github.com/madebyollin/taesd)
- Upscale images generated with [ESRGAN](https://github.com/xinntao/Real-ESRGAN)
-- 16-bit, 32-bit float support
-- 2-bit, 3-bit, 4-bit, 5-bit and 8-bit integer quantization support
-- Accelerated memory-efficient CPU inference
- - Only requires ~2.3GB when using txt2img with fp16 precision to generate a 512x512 image, enabling Flash Attention just requires ~1.8GB.
-- AVX, AVX2 and AVX512 support for x86 architectures
-- Full CUDA, Metal, Vulkan, OpenCL and SYCL backend for GPU acceleration.
-- Can load ckpt, safetensors and diffusers models/checkpoints. Standalone VAEs models
- - No need to convert to `.ggml` or `.gguf` anymore!
+- Supported backends
+ - CPU (AVX, AVX2 and AVX512 support for x86 architectures)
+ - CUDA
+ - Vulkan
+ - Metal
+ - OpenCL
+ - SYCL
+- Supported weight formats
+  - PyTorch checkpoint (`.ckpt` or `.pth`)
+  - Safetensors (`.safetensors`)
+ - GGUF (`.gguf`)
+- Supported platforms
+ - Linux
+  - macOS
+ - Windows
+ - Android (via Termux, [Local Diffusion](https://github.com/rmatif/Local-Diffusion))
- Flash Attention for memory usage optimization
- Negative prompt
- [stable-diffusion-webui](https://github.com/AUTOMATIC1111/stable-diffusion-webui) style tokenizer (not all the features, only token weighting for now)
@@ -54,371 +83,45 @@ API and command-line option may change frequently.***
- [`LCM`](https://github.com/AUTOMATIC1111/stable-diffusion-webui/issues/13952)
- Cross-platform reproducibility (`--rng cuda`, consistent with the `stable-diffusion-webui GPU RNG`)
- Embedds generation parameters into png output as webui-compatible text string
-- Supported platforms
- - Linux
- - Mac OS
- - Windows
- - Android (via Termux, [Local Diffusion](https://github.com/rmatif/Local-Diffusion))
-
-### TODO
-
-- [ ] More sampling methods
-- [ ] Make inference faster
- - The current implementation of ggml_conv_2d is slow and has high memory usage
-- [ ] Continuing to reduce memory usage (quantizing the weights of ggml_conv_2d)
-- [ ] Implement Inpainting support
-## Usage
+## Quick Start
-For most users, you can download the built executable program from the latest [release](https://github.com/leejet/stable-diffusion.cpp/releases/latest).
-If the built product does not meet your requirements, you can choose to build it manually.
+### Get the sd executable
-### Get the Code
-
-```
-git clone --recursive https://github.com/leejet/stable-diffusion.cpp
-cd stable-diffusion.cpp
-```
+- Download pre-built binaries from the [releases page](https://github.com/leejet/stable-diffusion.cpp/releases)
+- Or build from source by following the [build guide](./docs/build.md)
-- If you have already cloned the repository, you can use the following command to update the repository to the latest code.
+### Download model weights
-```
-cd stable-diffusion.cpp
-git pull origin master
-git submodule init
-git submodule update
-```
-
-### Download weights
-
-- download original weights(.ckpt or .safetensors). For example
- - Stable Diffusion v1.4 from https://huggingface.co/CompVis/stable-diffusion-v-1-4-original
+- Download weights (`.ckpt`, `.safetensors`, or `.gguf`). For example:
- Stable Diffusion v1.5 from https://huggingface.co/runwayml/stable-diffusion-v1-5
- - Stable Diffuison v2.1 from https://huggingface.co/stabilityai/stable-diffusion-2-1
- - Stable Diffusion 3 2B from https://huggingface.co/stabilityai/stable-diffusion-3-medium
-
- ```shell
- curl -L -O https://huggingface.co/CompVis/stable-diffusion-v-1-4-original/resolve/main/sd-v1-4.ckpt
- # curl -L -O https://huggingface.co/runwayml/stable-diffusion-v1-5/resolve/main/v1-5-pruned-emaonly.safetensors
- # curl -L -O https://huggingface.co/stabilityai/stable-diffusion-2-1/resolve/main/v2-1_768-nonema-pruned.safetensors
- # curl -L -O https://huggingface.co/stabilityai/stable-diffusion-3-medium/resolve/main/sd3_medium_incl_clips_t5xxlfp16.safetensors
- ```
-
-### Build
-
-#### Build from scratch
-
-```shell
-mkdir build
-cd build
-cmake ..
-cmake --build . --config Release
-```
-
-##### Using OpenBLAS
-
-```
-cmake .. -DGGML_OPENBLAS=ON
-cmake --build . --config Release
-```
-
-##### Using CUDA
-
-This provides BLAS acceleration using the CUDA cores of your Nvidia GPU. Make sure to have the CUDA toolkit installed. You can download it from your Linux distro's package manager (e.g. `apt install nvidia-cuda-toolkit`) or from here: [CUDA Toolkit](https://developer.nvidia.com/cuda-downloads). Recommended to have at least 4 GB of VRAM.
-
-```
-cmake .. -DSD_CUDA=ON
-cmake --build . --config Release
-```
-
-##### Using HipBLAS
-This provides BLAS acceleration using the ROCm cores of your AMD GPU. Make sure to have the ROCm toolkit installed.
-
-Windows User Refer to [docs/hipBLAS_on_Windows.md](docs%2FhipBLAS_on_Windows.md) for a comprehensive guide.
-
-```
-export GFX_NAME=$(rocminfo | grep -m 1 -E "gfx[^0]{1}" | sed -e 's/ *Name: *//' | awk '{$1=$1; print}' || echo "rocminfo missing")
-echo $GFX_NAME
-cmake .. -G "Ninja" -DCMAKE_C_COMPILER=clang -DCMAKE_CXX_COMPILER=clang++ -DSD_HIPBLAS=ON -DCMAKE_BUILD_TYPE=Release -DGPU_TARGETS=$GFX_NAME -DCMAKE_BUILD_WITH_INSTALL_RPATH=ON
-cmake --build . --config Release
-```
-
-##### Using MUSA
-
-This provides BLAS acceleration using the MUSA cores of your Moore Threads GPU. Make sure to have the MUSA toolkit installed.
-
-```bash
-cmake .. -DCMAKE_C_COMPILER=/usr/local/musa/bin/clang -DCMAKE_CXX_COMPILER=/usr/local/musa/bin/clang++ -DSD_MUSA=ON -DCMAKE_BUILD_TYPE=Release
-cmake --build . --config Release
-```
-
-##### Using Metal
-
-Using Metal makes the computation run on the GPU. Currently, there are some issues with Metal when performing operations on very large matrices, making it highly inefficient at the moment. Performance improvements are expected in the near future.
-
-```
-cmake .. -DSD_METAL=ON
-cmake --build . --config Release
-```
-
-##### Using Vulkan
-Install Vulkan SDK from https://www.lunarg.com/vulkan-sdk/.
-
-```
-cmake .. -DSD_VULKAN=ON
-cmake --build . --config Release
-```
-
-##### Using OpenCL (for Adreno GPU)
-
-Currently, it supports only Adreno GPUs and is primarily optimized for Q4_0 type
-
-To build for Windows ARM please refers to [Windows 11 Arm64
-](https://github.com/ggml-org/llama.cpp/blob/master/docs/backend/OPENCL.md#windows-11-arm64)
-
-Building for Android:
-
- Android NDK:
- Download and install the Android NDK from the [official Android developer site](https://developer.android.com/ndk/downloads).
-
-Setup OpenCL Dependencies for NDK:
-
-You need to provide OpenCL headers and the ICD loader library to your NDK sysroot.
-
-* OpenCL Headers:
- ```bash
- # In a temporary working directory
- git clone https://github.com/KhronosGroup/OpenCL-Headers
- cd OpenCL-Headers
- # Replace with your actual NDK installation path
- # e.g., cp -r CL /path/to/android-ndk-r26c/toolchains/llvm/prebuilt/linux-x86_64/sysroot/usr/include
- sudo cp -r CL /toolchains/llvm/prebuilt/linux-x86_64/sysroot/usr/include
- cd ..
+ ```sh
+ curl -L -O https://huggingface.co/runwayml/stable-diffusion-v1-5/resolve/main/v1-5-pruned-emaonly.safetensors
```
-* OpenCL ICD Loader:
- ```bash
- # In the same temporary working directory
- git clone https://github.com/KhronosGroup/OpenCL-ICD-Loader
- cd OpenCL-ICD-Loader
- mkdir build_ndk && cd build_ndk
-
- # Replace in the CMAKE_TOOLCHAIN_FILE and OPENCL_ICD_LOADER_HEADERS_DIR
- cmake .. -G Ninja -DCMAKE_BUILD_TYPE=Release \
- -DCMAKE_TOOLCHAIN_FILE=/build/cmake/android.toolchain.cmake \
- -DOPENCL_ICD_LOADER_HEADERS_DIR=/toolchains/llvm/prebuilt/linux-x86_64/sysroot/usr/include \
- -DANDROID_ABI=arm64-v8a \
- -DANDROID_PLATFORM=24 \
- -DANDROID_STL=c++_shared
-
- ninja
- # Replace
- # e.g., cp libOpenCL.so /path/to/android-ndk-r26c/toolchains/llvm/prebuilt/linux-x86_64/sysroot/usr/lib/aarch64-linux-android
- sudo cp libOpenCL.so /toolchains/llvm/prebuilt/linux-x86_64/sysroot/usr/lib/aarch64-linux-android
- cd ../..
- ```
-
-Build `stable-diffusion.cpp` for Android with OpenCL:
-
-```bash
-mkdir build-android && cd build-android
-
-# Replace with your actual NDK installation path
-# e.g., -DCMAKE_TOOLCHAIN_FILE=/path/to/android-ndk-r26c/build/cmake/android.toolchain.cmake
-cmake .. -G Ninja \
- -DCMAKE_TOOLCHAIN_FILE=/build/cmake/android.toolchain.cmake \
- -DANDROID_ABI=arm64-v8a \
- -DANDROID_PLATFORM=android-28 \
- -DGGML_OPENMP=OFF \
- -DSD_OPENCL=ON
-
-ninja
-```
-*(Note: Don't forget to include `LD_LIBRARY_PATH=/vendor/lib64` in your command line before running the binary)*
-
-##### Using SYCL
-
-Using SYCL makes the computation run on the Intel GPU. Please make sure you have installed the related driver and [Intel® oneAPI Base toolkit](https://www.intel.com/content/www/us/en/developer/tools/oneapi/base-toolkit.html) before start. More details and steps can refer to [llama.cpp SYCL backend](https://github.com/ggerganov/llama.cpp/blob/master/docs/backend/SYCL.md#linux).
-
-```
-# Export relevant ENV variables
-source /opt/intel/oneapi/setvars.sh
-
-# Option 1: Use FP32 (recommended for better performance in most cases)
-cmake .. -DSD_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx
-
-# Option 2: Use FP16
-cmake .. -DSD_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx -DGGML_SYCL_F16=ON
-
-cmake --build . --config Release
-```
-
-Example of text2img by using SYCL backend:
-
-- download `stable-diffusion` model weight, refer to [download-weight](#download-weights).
-
-- run `./bin/sd -m ../models/sd3_medium_incl_clips_t5xxlfp16.safetensors --cfg-scale 5 --steps 30 --sampling-method euler -H 1024 -W 1024 --seed 42 -p "fantasy medieval village world inside a glass sphere , high detail, fantasy, realistic, light effect, hyper detail, volumetric lighting, cinematic, macro, depth of field, blur, red light and clouds from the back, highly detailed epic cinematic concept art cg render made in maya, blender and photoshop, octane render, excellent composition, dynamic dramatic cinematic lighting, aesthetic, very inspirational, world inside a glass sphere by james gurney by artgerm with james jean, joe fenton and tristan eaton by ross tran, fine details, 4k resolution"`
-
-
-
-
-
-
-
-##### Using Flash Attention
-
-Enabling flash attention for the diffusion model reduces memory usage by varying amounts of MB.
-eg.:
- - flux 768x768 ~600mb
- - SD2 768x768 ~1400mb
-
-For most backends, it slows things down, but for cuda it generally speeds it up too.
-At the moment, it is only supported for some models and some backends (like cpu, cuda/rocm, metal).
-
-Run by adding `--diffusion-fa` to the arguments and watch for:
-```
-[INFO ] stable-diffusion.cpp:312 - Using flash attention in the diffusion model
-```
-and the compute buffer shrink in the debug log:
-```
-[DEBUG] ggml_extend.hpp:1004 - flux compute buffer size: 650.00 MB(VRAM)
-```
-
-### Run
-
-```
-usage: ./bin/sd [arguments]
-
-arguments:
- -h, --help show this help message and exit
- -M, --mode [MODE] run mode, one of: [img_gen, vid_gen, convert], default: img_gen
- -t, --threads N number of threads to use during computation (default: -1)
- If threads <= 0, then threads will be set to the number of CPU physical cores
- --offload-to-cpu place the weights in RAM to save VRAM, and automatically load them into VRAM when needed
- -m, --model [MODEL] path to full model
- --diffusion-model path to the standalone diffusion model
- --high-noise-diffusion-model path to the standalone high noise diffusion model
- --clip_l path to the clip-l text encoder
- --clip_g path to the clip-g text encoder
- --clip_vision path to the clip-vision encoder
- --t5xxl path to the t5xxl text encoder
- --vae [VAE] path to vae
- --taesd [TAESD_PATH] path to taesd. Using Tiny AutoEncoder for fast decoding (low quality)
- --control-net [CONTROL_PATH] path to control net model
- --embd-dir [EMBEDDING_PATH] path to embeddings
- --stacked-id-embd-dir [DIR] path to PHOTOMAKER stacked id embeddings
- --input-id-images-dir [DIR] path to PHOTOMAKER input id images dir
- --normalize-input normalize PHOTOMAKER input id images
- --upscale-model [ESRGAN_PATH] path to esrgan model. Upscale images after generate, just RealESRGAN_x4plus_anime_6B supported by now
- --upscale-repeats Run the ESRGAN upscaler this many times (default 1)
- --type [TYPE] weight type (examples: f32, f16, q4_0, q4_1, q5_0, q5_1, q8_0, q2_K, q3_K, q4_K)
- If not specified, the default is the type of the weight file
- --tensor-type-rules [EXPRESSION] weight type per tensor pattern (example: "^vae\.=f16,model\.=q8_0")
- --lora-model-dir [DIR] lora model directory
- -i, --init-img [IMAGE] path to the init image, required by img2img
- --mask [MASK] path to the mask image, required by img2img with mask
- -i, --end-img [IMAGE] path to the end image, required by flf2v
- --control-image [IMAGE] path to image condition, control net
- -r, --ref-image [PATH] reference image for Flux Kontext models (can be used multiple times)
- --increase-ref-index automatically increase the indices of references images based on the order they are listed (starting with 1).
- -o, --output OUTPUT path to write result image to (default: ./output.png)
- -p, --prompt [PROMPT] the prompt to render
- -n, --negative-prompt PROMPT the negative prompt (default: "")
- --cfg-scale SCALE unconditional guidance scale: (default: 7.0)
- --img-cfg-scale SCALE image guidance scale for inpaint or instruct-pix2pix models: (default: same as --cfg-scale)
- --guidance SCALE distilled guidance scale for models with guidance input (default: 3.5)
- --slg-scale SCALE skip layer guidance (SLG) scale, only for DiT models: (default: 0)
- 0 means disabled, a value of 2.5 is nice for sd3.5 medium
- --eta SCALE eta in DDIM, only for DDIM and TCD: (default: 0)
- --skip-layers LAYERS Layers to skip for SLG steps: (default: [7,8,9])
- --skip-layer-start START SLG enabling point: (default: 0.01)
- --skip-layer-end END SLG disabling point: (default: 0.2)
- --scheduler {discrete, karras, exponential, ays, gits} Denoiser sigma scheduler (default: discrete)
- --sampling-method {euler, euler_a, heun, dpm2, dpm++2s_a, dpm++2m, dpm++2mv2, ipndm, ipndm_v, lcm, ddim_trailing, tcd}
- sampling method (default: "euler_a")
- --steps STEPS number of sample steps (default: 20)
- --high-noise-cfg-scale SCALE (high noise) unconditional guidance scale: (default: 7.0)
- --high-noise-img-cfg-scale SCALE (high noise) image guidance scale for inpaint or instruct-pix2pix models: (default: same as --cfg-scale)
- --high-noise-guidance SCALE (high noise) distilled guidance scale for models with guidance input (default: 3.5)
- --high-noise-slg-scale SCALE (high noise) skip layer guidance (SLG) scale, only for DiT models: (default: 0)
- 0 means disabled, a value of 2.5 is nice for sd3.5 medium
- --high-noise-eta SCALE (high noise) eta in DDIM, only for DDIM and TCD: (default: 0)
- --high-noise-skip-layers LAYERS (high noise) Layers to skip for SLG steps: (default: [7,8,9])
- --high-noise-skip-layer-start (high noise) SLG enabling point: (default: 0.01)
- --high-noise-skip-layer-end END (high noise) SLG disabling point: (default: 0.2)
- --high-noise-scheduler {discrete, karras, exponential, ays, gits} Denoiser sigma scheduler (default: discrete)
- --high-noise-sampling-method {euler, euler_a, heun, dpm2, dpm++2s_a, dpm++2m, dpm++2mv2, ipndm, ipndm_v, lcm, ddim_trailing, tcd}
- (high noise) sampling method (default: "euler_a")
- --high-noise-steps STEPS (high noise) number of sample steps (default: -1 = auto)
- SLG will be enabled at step int([STEPS]*[START]) and disabled at int([STEPS]*[END])
- --strength STRENGTH strength for noising/unnoising (default: 0.75)
- --style-ratio STYLE-RATIO strength for keeping input identity (default: 20)
- --control-strength STRENGTH strength to apply Control Net (default: 0.9)
- 1.0 corresponds to full destruction of information in init image
- -H, --height H image height, in pixel space (default: 512)
- -W, --width W image width, in pixel space (default: 512)
- --rng {std_default, cuda} RNG (default: cuda)
- -s SEED, --seed SEED RNG seed (default: 42, use random seed for < 0)
- -b, --batch-count COUNT number of images to generate
- --clip-skip N ignore last_dot_pos layers of CLIP network; 1 ignores none, 2 ignores one layer (default: -1)
- <= 0 represents unspecified, will be 1 for SD1.x, 2 for SD2.x
- --vae-tiling process vae in tiles to reduce memory usage
- --vae-on-cpu keep vae in cpu (for low vram)
- --clip-on-cpu keep clip in cpu (for low vram)
- --diffusion-fa use flash attention in the diffusion model (for low vram)
- Might lower quality, since it implies converting k and v to f16.
- This might crash if it is not supported by the backend.
- --diffusion-conv-direct use Conv2d direct in the diffusion model
- This might crash if it is not supported by the backend.
- --vae-conv-direct use Conv2d direct in the vae model (should improve the performance)
- This might crash if it is not supported by the backend.
- --control-net-cpu keep controlnet in cpu (for low vram)
- --canny apply canny preprocessor (edge detection)
- --color colors the logging tags according to level
- --chroma-disable-dit-mask disable dit mask for chroma
- --chroma-enable-t5-mask enable t5 mask for chroma
- --chroma-t5-mask-pad PAD_SIZE t5 mask pad size of chroma
- --video-frames video frames (default: 1)
- --fps fps (default: 24)
- --moe-boundary BOUNDARY timestep boundary for Wan2.2 MoE model. (default: 0.875)
- only enabled if `--high-noise-steps` is set to -1
- --flow-shift SHIFT shift value for Flow models like SD3.x or WAN (default: auto)
- -v, --verbose print extra info
-```
-
-#### txt2img example
+### Generate an image with just one command
```sh
-./bin/sd -m ../models/sd-v1-4.ckpt -p "a lovely cat"
-# ./bin/sd -m ../models/v1-5-pruned-emaonly.safetensors -p "a lovely cat"
-# ./bin/sd -m ../models/sd_xl_base_1.0.safetensors --vae ../models/sdxl_vae-fp16-fix.safetensors -H 1024 -W 1024 -p "a lovely cat" -v
-# ./bin/sd -m ../models/sd3_medium_incl_clips_t5xxlfp16.safetensors -H 1024 -W 1024 -p 'a lovely cat holding a sign says \"Stable Diffusion CPP\"' --cfg-scale 4.5 --sampling-method euler -v
-# ./bin/sd --diffusion-model ../models/flux1-dev-q3_k.gguf --vae ../models/ae.sft --clip_l ../models/clip_l.safetensors --t5xxl ../models/t5xxl_fp16.safetensors -p "a lovely cat holding a sign says 'flux.cpp'" --cfg-scale 1.0 --sampling-method euler -v
-# ./bin/sd -m ..\models\sd3.5_large.safetensors --clip_l ..\models\clip_l.safetensors --clip_g ..\models\clip_g.safetensors --t5xxl ..\models\t5xxl_fp16.safetensors -H 1024 -W 1024 -p 'a lovely cat holding a sign says \"Stable diffusion 3.5 Large\"' --cfg-scale 4.5 --sampling-method euler -v
+./bin/sd -m ../models/v1-5-pruned-emaonly.safetensors -p "a lovely cat"
```
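+
+A minimal img2img sketch (assuming `./output.png` was produced by the command above; flags as documented in the CLI help):
+
+```sh
+./bin/sd -m ../models/v1-5-pruned-emaonly.safetensors -p "cat with blue eyes" -i ./output.png -o ./img2img_output.png --strength 0.4
+```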
-Using formats of different precisions will yield results of varying quality.
-
-| f32 | f16 |q8_0 |q5_0 |q5_1 |q4_0 |q4_1 |
-| ---- |---- |---- |---- |---- |---- |---- |
-|  | | | | | | |
-
-#### img2img example
+***For detailed command-line arguments, check out the [CLI docs](./examples/cli/README.md).***
-- `./output.png` is the image generated from the above txt2img pipeline
+## Performance
-
-```
-./bin/sd -m ../models/sd-v1-4.ckpt -p "cat with blue eyes" -i ./output.png -o ./img2img_output.png --strength 0.4
-```
-
-
-
-
+If you want to improve performance or reduce VRAM/RAM usage, please refer to the [performance guide](./docs/performance.md).
## More Guides
+- [SD1.x/SD2.x/SDXL](./docs/sd.md)
+- [SD3/SD3.5](./docs/sd3.md)
+- [Flux-dev/Flux-schnell](./docs/flux.md)
+- [FLUX.1-Kontext-dev](./docs/kontext.md)
+- [Chroma](./docs/chroma.md)
+- [🔥Qwen Image](./docs/qwen_image.md)
+- [🔥Qwen Image Edit/Qwen Image Edit 2509](./docs/qwen_image_edit.md)
+- [🔥Wan2.1/Wan2.2](./docs/wan.md)
- [LoRA](./docs/lora.md)
- [LCM/LCM-LoRA](./docs/lcm.md)
- [Using PhotoMaker to personalize image generation](./docs/photo_maker.md)
@@ -448,6 +151,8 @@ These projects use `stable-diffusion.cpp` as a backend for their image generatio
- [Local Diffusion](https://github.com/rmatif/Local-Diffusion)
- [sd.cpp-webui](https://github.com/daniandtheweb/sd.cpp-webui)
- [LocalAI](https://github.com/mudler/LocalAI)
+- [Neural-Pixel](https://github.com/Luiz-Alcantara/Neural-Pixel)
+- [KoboldCpp](https://github.com/LostRuins/koboldcpp)
## Contributors
@@ -461,7 +166,8 @@ Thank you to all the people who have already contributed to stable-diffusion.cpp
## References
-- [ggml](https://github.com/ggerganov/ggml)
+- [ggml](https://github.com/ggml-org/ggml)
+- [diffusers](https://github.com/huggingface/diffusers)
- [stable-diffusion](https://github.com/CompVis/stable-diffusion)
- [sd3-ref](https://github.com/Stability-AI/sd3-ref)
- [stable-diffusion-stability-ai](https://github.com/Stability-AI/stablediffusion)
@@ -472,4 +178,4 @@ Thank you to all the people who have already contributed to stable-diffusion.cpp
- [generative-models](https://github.com/Stability-AI/generative-models/)
- [PhotoMaker](https://github.com/TencentARC/PhotoMaker)
- [Wan2.1](https://github.com/Wan-Video/Wan2.1)
-- [Wan2.2](https://github.com/Wan-Video/Wan2.2)
\ No newline at end of file
+- [Wan2.2](https://github.com/Wan-Video/Wan2.2)
diff --git a/assets/flux/chroma1-radiance.png b/assets/flux/chroma1-radiance.png
new file mode 100644
index 000000000..1dd4a524a
Binary files /dev/null and b/assets/flux/chroma1-radiance.png differ
diff --git a/assets/qwen/example.png b/assets/qwen/example.png
new file mode 100644
index 000000000..f51bbd519
Binary files /dev/null and b/assets/qwen/example.png differ
diff --git a/assets/qwen/qwen_image_edit.png b/assets/qwen/qwen_image_edit.png
new file mode 100644
index 000000000..c2a31eda9
Binary files /dev/null and b/assets/qwen/qwen_image_edit.png differ
diff --git a/assets/qwen/qwen_image_edit_2509.png b/assets/qwen/qwen_image_edit_2509.png
new file mode 100644
index 000000000..442ba9b3c
Binary files /dev/null and b/assets/qwen/qwen_image_edit_2509.png differ
diff --git a/assets/wan/Wan2.1_1.3B_vace_r2v.mp4 b/assets/wan/Wan2.1_1.3B_vace_r2v.mp4
new file mode 100644
index 000000000..05f6cfa2f
Binary files /dev/null and b/assets/wan/Wan2.1_1.3B_vace_r2v.mp4 differ
diff --git a/assets/wan/Wan2.1_1.3B_vace_t2v.mp4 b/assets/wan/Wan2.1_1.3B_vace_t2v.mp4
new file mode 100644
index 000000000..73862e84d
Binary files /dev/null and b/assets/wan/Wan2.1_1.3B_vace_t2v.mp4 differ
diff --git a/assets/wan/Wan2.1_1.3B_vace_v2v.mp4 b/assets/wan/Wan2.1_1.3B_vace_v2v.mp4
new file mode 100644
index 000000000..2cc4c0a93
Binary files /dev/null and b/assets/wan/Wan2.1_1.3B_vace_v2v.mp4 differ
diff --git a/assets/wan/Wan2.1_14B_vace_r2v.mp4 b/assets/wan/Wan2.1_14B_vace_r2v.mp4
new file mode 100644
index 000000000..686371fb9
Binary files /dev/null and b/assets/wan/Wan2.1_14B_vace_r2v.mp4 differ
diff --git a/assets/wan/Wan2.1_14B_vace_t2v.mp4 b/assets/wan/Wan2.1_14B_vace_t2v.mp4
new file mode 100644
index 000000000..cebe8f974
Binary files /dev/null and b/assets/wan/Wan2.1_14B_vace_t2v.mp4 differ
diff --git a/assets/wan/Wan2.1_14B_vace_v2v.mp4 b/assets/wan/Wan2.1_14B_vace_v2v.mp4
new file mode 100644
index 000000000..95f30d45e
Binary files /dev/null and b/assets/wan/Wan2.1_14B_vace_v2v.mp4 differ
diff --git a/clip.hpp b/clip.hpp
index f92c9c2fa..e2a892ca0 100644
--- a/clip.hpp
+++ b/clip.hpp
@@ -6,7 +6,7 @@
/*================================================== CLIPTokenizer ===================================================*/
-std::pair<std::unordered_map<std::string, float>, std::string> extract_and_remove_lora(std::string text) {
+__STATIC_INLINE__ std::pair<std::unordered_map<std::string, float>, std::string> extract_and_remove_lora(std::string text) {
std::regex re("<lora:([^:]+):([^>]+)>");
std::smatch matches;
std::unordered_map<std::string, float> filename2multiplier;
@@ -31,7 +31,7 @@ std::pair<std::unordered_map<std::string, float>, std::string> extract_and_remov
return std::make_pair(filename2multiplier, text);
}
-std::vector<std::pair<int, std::u32string>> bytes_to_unicode() {
+__STATIC_INLINE__ std::vector<std::pair<int, std::u32string>> bytes_to_unicode() {
std::vector<std::pair<int, std::u32string>> byte_unicode_pairs;
std::set<int> byte_set;
for (int b = static_cast<int>('!'); b <= static_cast<int>('~'); ++b) {
@@ -451,16 +451,16 @@ struct CLIPMLP : public GGMLBlock {
}
}
- struct ggml_tensor* forward(struct ggml_context* ctx, struct ggml_tensor* x) {
+ struct ggml_tensor* forward(GGMLRunnerContext* ctx, struct ggml_tensor* x) {
// x: [N, n_token, d_model]
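+        // GGMLRunnerContext bundles the graph context with backend state; raw
+        // ggml_* ops below now go through ctx->ggml_ctx.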
auto fc1 = std::dynamic_pointer_cast<Linear>(blocks["fc1"]);
auto fc2 = std::dynamic_pointer_cast<Linear>(blocks["fc2"]);
x = fc1->forward(ctx, x);
if (use_gelu) {
- x = ggml_gelu_inplace(ctx, x);
+ x = ggml_gelu_inplace(ctx->ggml_ctx, x);
} else {
- x = ggml_gelu_quick_inplace(ctx, x);
+ x = ggml_gelu_quick_inplace(ctx->ggml_ctx, x);
}
x = fc2->forward(ctx, x);
return x;
@@ -476,11 +476,12 @@ struct CLIPLayer : public GGMLBlock {
public:
CLIPLayer(int64_t d_model,
int64_t n_head,
- int64_t intermediate_size)
+ int64_t intermediate_size,
+ bool proj_in = false)
: d_model(d_model),
n_head(n_head),
intermediate_size(intermediate_size) {
-        blocks["self_attn"] = std::shared_ptr<GGMLBlock>(new MultiheadAttention(d_model, n_head, true, true));
+        blocks["self_attn"] = std::shared_ptr<GGMLBlock>(new MultiheadAttention(d_model, n_head, true, true, proj_in));
blocks["layer_norm1"] = std::shared_ptr<GGMLBlock>(new LayerNorm(d_model));
blocks["layer_norm2"] = std::shared_ptr<GGMLBlock>(new LayerNorm(d_model));
@@ -488,15 +489,15 @@ struct CLIPLayer : public GGMLBlock {
blocks["mlp"] = std::shared_ptr(new CLIPMLP(d_model, intermediate_size));
}
- struct ggml_tensor* forward(struct ggml_context* ctx, ggml_backend_t backend, struct ggml_tensor* x, bool mask = true) {
+ struct ggml_tensor* forward(GGMLRunnerContext* ctx, struct ggml_tensor* x, bool mask = true) {
// x: [N, n_token, d_model]
auto self_attn = std::dynamic_pointer_cast<MultiheadAttention>(blocks["self_attn"]);
auto layer_norm1 = std::dynamic_pointer_cast<LayerNorm>(blocks["layer_norm1"]);
auto layer_norm2 = std::dynamic_pointer_cast<LayerNorm>(blocks["layer_norm2"]);
auto mlp = std::dynamic_pointer_cast<CLIPMLP>(blocks["mlp"]);
- x = ggml_add(ctx, x, self_attn->forward(ctx, backend, layer_norm1->forward(ctx, x), mask));
- x = ggml_add(ctx, x, mlp->forward(ctx, layer_norm2->forward(ctx, x)));
+ x = ggml_add(ctx->ggml_ctx, x, self_attn->forward(ctx, layer_norm1->forward(ctx, x), mask));
+ x = ggml_add(ctx->ggml_ctx, x, mlp->forward(ctx, layer_norm2->forward(ctx, x)));
return x;
}
};
@@ -509,16 +510,16 @@ struct CLIPEncoder : public GGMLBlock {
CLIPEncoder(int64_t n_layer,
int64_t d_model,
int64_t n_head,
- int64_t intermediate_size)
+ int64_t intermediate_size,
+ bool proj_in = false)
: n_layer(n_layer) {
for (int i = 0; i < n_layer; i++) {
std::string name = "layers." + std::to_string(i);
-            blocks[name] = std::shared_ptr<GGMLBlock>(new CLIPLayer(d_model, n_head, intermediate_size));
+            blocks[name] = std::shared_ptr<GGMLBlock>(new CLIPLayer(d_model, n_head, intermediate_size, proj_in));
}
}
- struct ggml_tensor* forward(struct ggml_context* ctx,
- ggml_backend_t backend,
+ struct ggml_tensor* forward(GGMLRunnerContext* ctx,
struct ggml_tensor* x,
int clip_skip = -1,
bool mask = true) {
@@ -536,7 +537,7 @@ struct CLIPEncoder : public GGMLBlock {
}
std::string name = "layers." + std::to_string(i);
auto layer = std::dynamic_pointer_cast<CLIPLayer>(blocks[name]);
- x = layer->forward(ctx, backend, x, mask); // [N, n_token, d_model]
+ x = layer->forward(ctx, x, mask); // [N, n_token, d_model]
// LOG_DEBUG("layer %d", i);
}
return x;
@@ -548,11 +549,17 @@ class CLIPEmbeddings : public GGMLBlock {
int64_t embed_dim;
int64_t vocab_size;
int64_t num_positions;
-
- void init_params(struct ggml_context* ctx, const String2GGMLType& tensor_types = {}, const std::string prefix = "") {
- enum ggml_type token_wtype = GGML_TYPE_F32;
- enum ggml_type position_wtype = GGML_TYPE_F32;
-
+ bool force_clip_f32;
+
+ void init_params(struct ggml_context* ctx, const String2TensorStorage& tensor_storage_map = {}, const std::string prefix = "") override {
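+        // Keep the checkpoint's own type for the token embedding when possible,
+        // falling back to f32 when forced or when get_rows cannot read that type.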
+ enum ggml_type token_wtype = GGML_TYPE_F32;
+ if (!force_clip_f32) {
+ token_wtype = get_type(prefix + "token_embedding.weight", tensor_storage_map, GGML_TYPE_F32);
+ if (!support_get_rows(token_wtype)) {
+ token_wtype = GGML_TYPE_F32;
+ }
+ }
+ enum ggml_type position_wtype = GGML_TYPE_F32;
params["token_embedding.weight"] = ggml_new_tensor_2d(ctx, token_wtype, embed_dim, vocab_size);
params["position_embedding.weight"] = ggml_new_tensor_2d(ctx, position_wtype, embed_dim, num_positions);
}
@@ -560,17 +567,19 @@ class CLIPEmbeddings : public GGMLBlock {
public:
CLIPEmbeddings(int64_t embed_dim,
int64_t vocab_size = 49408,
- int64_t num_positions = 77)
+ int64_t num_positions = 77,
+ bool force_clip_f32 = false)
: embed_dim(embed_dim),
vocab_size(vocab_size),
- num_positions(num_positions) {
+ num_positions(num_positions),
+ force_clip_f32(force_clip_f32) {
}
struct ggml_tensor* get_token_embed_weight() {
return params["token_embedding.weight"];
}
- struct ggml_tensor* forward(struct ggml_context* ctx,
+ struct ggml_tensor* forward(GGMLRunnerContext* ctx,
struct ggml_tensor* input_ids,
struct ggml_tensor* custom_embed_weight) {
// input_ids: [N, n_token]
@@ -578,12 +587,12 @@ class CLIPEmbeddings : public GGMLBlock {
auto position_embed_weight = params["position_embedding.weight"];
GGML_ASSERT(input_ids->ne[0] == position_embed_weight->ne[1]);
- input_ids = ggml_reshape_3d(ctx, input_ids, input_ids->ne[0], 1, input_ids->ne[1]);
- auto token_embedding = ggml_get_rows(ctx, custom_embed_weight != NULL ? custom_embed_weight : token_embed_weight, input_ids);
- token_embedding = ggml_reshape_3d(ctx, token_embedding, token_embedding->ne[0], token_embedding->ne[1], token_embedding->ne[3]);
+ input_ids = ggml_reshape_3d(ctx->ggml_ctx, input_ids, input_ids->ne[0], 1, input_ids->ne[1]);
+ auto token_embedding = ggml_get_rows(ctx->ggml_ctx, custom_embed_weight != nullptr ? custom_embed_weight : token_embed_weight, input_ids);
+ token_embedding = ggml_reshape_3d(ctx->ggml_ctx, token_embedding, token_embedding->ne[0], token_embedding->ne[1], token_embedding->ne[3]);
// token_embedding + position_embedding
- auto x = ggml_add(ctx,
+ auto x = ggml_add(ctx->ggml_ctx,
token_embedding,
position_embed_weight); // [N, n_token, embed_dim]
return x;
@@ -598,7 +607,8 @@ class CLIPVisionEmbeddings : public GGMLBlock {
int64_t image_size;
int64_t num_patches;
int64_t num_positions;
- void init_params(struct ggml_context* ctx, const String2GGMLType& tensor_types = {}, const std::string prefix = "") {
+
+ void init_params(struct ggml_context* ctx, const String2TensorStorage& tensor_storage_map = {}, const std::string prefix = "") override {
enum ggml_type patch_wtype = GGML_TYPE_F16;
enum ggml_type class_wtype = GGML_TYPE_F32;
enum ggml_type position_wtype = GGML_TYPE_F32;
@@ -621,7 +631,7 @@ class CLIPVisionEmbeddings : public GGMLBlock {
num_positions = num_patches + 1;
}
- struct ggml_tensor* forward(struct ggml_context* ctx, struct ggml_tensor* pixel_values) {
+ struct ggml_tensor* forward(GGMLRunnerContext* ctx, struct ggml_tensor* pixel_values) {
// pixel_values: [N, num_channels, image_size, image_size]
// return: [N, num_positions, embed_dim]
GGML_ASSERT(pixel_values->ne[0] == image_size && pixel_values->ne[1] == image_size && pixel_values->ne[2] == num_channels);
@@ -633,18 +643,18 @@ class CLIPVisionEmbeddings : public GGMLBlock {
// concat(patch_embedding, class_embedding) + position_embedding
struct ggml_tensor* patch_embedding;
int64_t N = pixel_values->ne[3];
- patch_embedding = ggml_nn_conv_2d(ctx, pixel_values, patch_embed_weight, NULL, patch_size, patch_size); // [N, embed_dim, image_size // pacht_size, image_size // pacht_size]
- patch_embedding = ggml_reshape_3d(ctx, patch_embedding, num_patches, embed_dim, N); // [N, embed_dim, num_patches]
- patch_embedding = ggml_cont(ctx, ggml_permute(ctx, patch_embedding, 1, 0, 2, 3)); // [N, num_patches, embed_dim]
- patch_embedding = ggml_reshape_4d(ctx, patch_embedding, 1, embed_dim, num_patches, N); // [N, num_patches, embed_dim, 1]
-
- struct ggml_tensor* class_embedding = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, embed_dim, N);
- class_embedding = ggml_repeat(ctx, class_embed_weight, class_embedding); // [N, embed_dim]
- class_embedding = ggml_reshape_4d(ctx, class_embedding, 1, embed_dim, 1, N); // [N, 1, embed_dim, 1]
-
- struct ggml_tensor* x = ggml_concat(ctx, class_embedding, patch_embedding, 2); // [N, num_positions, embed_dim, 1]
- x = ggml_reshape_3d(ctx, x, embed_dim, num_positions, N); // [N, num_positions, embed_dim]
- x = ggml_add(ctx, x, position_embed_weight);
+        patch_embedding = ggml_ext_conv_2d(ctx->ggml_ctx, pixel_values, patch_embed_weight, nullptr, patch_size, patch_size); // [N, embed_dim, image_size // patch_size, image_size // patch_size]
+ patch_embedding = ggml_reshape_3d(ctx->ggml_ctx, patch_embedding, num_patches, embed_dim, N); // [N, embed_dim, num_patches]
+ patch_embedding = ggml_cont(ctx->ggml_ctx, ggml_permute(ctx->ggml_ctx, patch_embedding, 1, 0, 2, 3)); // [N, num_patches, embed_dim]
+ patch_embedding = ggml_reshape_4d(ctx->ggml_ctx, patch_embedding, 1, embed_dim, num_patches, N); // [N, num_patches, embed_dim, 1]
+
+ struct ggml_tensor* class_embedding = ggml_new_tensor_2d(ctx->ggml_ctx, GGML_TYPE_F32, embed_dim, N);
+ class_embedding = ggml_repeat(ctx->ggml_ctx, class_embed_weight, class_embedding); // [N, embed_dim]
+ class_embedding = ggml_reshape_4d(ctx->ggml_ctx, class_embedding, 1, embed_dim, 1, N); // [N, 1, embed_dim, 1]
+
+ struct ggml_tensor* x = ggml_concat(ctx->ggml_ctx, class_embedding, patch_embedding, 2); // [N, num_positions, embed_dim, 1]
+ x = ggml_reshape_3d(ctx->ggml_ctx, x, embed_dim, num_positions, N); // [N, num_positions, embed_dim]
+ x = ggml_add(ctx->ggml_ctx, x, position_embed_weight);
return x; // [N, num_positions, embed_dim]
}
};
@@ -661,7 +671,7 @@ enum CLIPVersion {
class CLIPTextModel : public GGMLBlock {
protected:
- void init_params(struct ggml_context* ctx, const String2GGMLType& tensor_types = {}, const std::string prefix = "") {
+ void init_params(struct ggml_context* ctx, const String2TensorStorage& tensor_storage_map = {}, const std::string prefix = "") override {
if (version == OPEN_CLIP_VIT_BIGG_14) {
enum ggml_type wtype = GGML_TYPE_F32;
params["text_projection"] = ggml_new_tensor_2d(ctx, wtype, projection_dim, hidden_size);
@@ -678,12 +688,12 @@ class CLIPTextModel : public GGMLBlock {
int32_t n_head = 12;
int32_t n_layer = 12; // num_hidden_layers
int32_t projection_dim = 1280; // only for OPEN_CLIP_VIT_BIGG_14
- int32_t clip_skip = -1;
bool with_final_ln = true;
CLIPTextModel(CLIPVersion version = OPENAI_CLIP_VIT_L_14,
bool with_final_ln = true,
- int clip_skip_value = -1)
+ bool force_clip_f32 = false,
+ bool proj_in = false)
: version(version), with_final_ln(with_final_ln) {
if (version == OPEN_CLIP_VIT_H_14) {
hidden_size = 1024;
@@ -696,47 +706,39 @@ class CLIPTextModel : public GGMLBlock {
n_head = 20;
n_layer = 32;
}
- set_clip_skip(clip_skip_value);
- blocks["embeddings"] = std::shared_ptr(new CLIPEmbeddings(hidden_size, vocab_size, n_token));
- blocks["encoder"] = std::shared_ptr(new CLIPEncoder(n_layer, hidden_size, n_head, intermediate_size));
+ blocks["embeddings"] = std::shared_ptr(new CLIPEmbeddings(hidden_size, vocab_size, n_token, force_clip_f32));
+ blocks["encoder"] = std::shared_ptr(new CLIPEncoder(n_layer, hidden_size, n_head, intermediate_size, proj_in));
blocks["final_layer_norm"] = std::shared_ptr(new LayerNorm(hidden_size));
}
- void set_clip_skip(int skip) {
- if (skip <= 0) {
- skip = -1;
- }
- clip_skip = skip;
- }
-
struct ggml_tensor* get_token_embed_weight() {
auto embeddings = std::dynamic_pointer_cast<CLIPEmbeddings>(blocks["embeddings"]);
return embeddings->get_token_embed_weight();
}
- struct ggml_tensor* forward(struct ggml_context* ctx,
- ggml_backend_t backend,
+ struct ggml_tensor* forward(GGMLRunnerContext* ctx,
struct ggml_tensor* input_ids,
struct ggml_tensor* tkn_embeddings,
size_t max_token_idx = 0,
- bool return_pooled = false) {
+ bool return_pooled = false,
+ int clip_skip = -1) {
// input_ids: [N, n_token]
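+        // clip_skip is now passed per call rather than stored on the model;
+        // values <= 0 leave all encoder layers active.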
auto embeddings = std::dynamic_pointer_cast<CLIPEmbeddings>(blocks["embeddings"]);
auto encoder = std::dynamic_pointer_cast<CLIPEncoder>(blocks["encoder"]);
auto final_layer_norm = std::dynamic_pointer_cast<LayerNorm>(blocks["final_layer_norm"]);
auto x = embeddings->forward(ctx, input_ids, tkn_embeddings); // [N, n_token, hidden_size]
- x = encoder->forward(ctx, backend, x, return_pooled ? -1 : clip_skip, true);
+ x = encoder->forward(ctx, x, return_pooled ? -1 : clip_skip, true);
if (return_pooled || with_final_ln) {
x = final_layer_norm->forward(ctx, x);
}
if (return_pooled) {
auto text_projection = params["text_projection"];
- ggml_tensor* pooled = ggml_view_1d(ctx, x, hidden_size, x->nb[1] * max_token_idx);
- if (text_projection != NULL) {
- pooled = ggml_nn_linear(ctx, pooled, text_projection, NULL);
+ ggml_tensor* pooled = ggml_view_1d(ctx->ggml_ctx, x, hidden_size, x->nb[1] * max_token_idx);
+ if (text_projection != nullptr) {
+ pooled = ggml_ext_linear(ctx->ggml_ctx, pooled, text_projection, nullptr);
} else {
LOG_DEBUG("identity projection");
}
@@ -760,7 +762,7 @@ class CLIPVisionModel : public GGMLBlock {
int32_t n_layer = 24;
public:
- CLIPVisionModel(CLIPVersion version = OPENAI_CLIP_VIT_L_14) {
+ CLIPVisionModel(CLIPVersion version = OPENAI_CLIP_VIT_L_14, bool proj_in = false) {
if (version == OPEN_CLIP_VIT_H_14) {
hidden_size = 1280;
intermediate_size = 5120;
@@ -775,12 +777,11 @@ class CLIPVisionModel : public GGMLBlock {
blocks["embeddings"] = std::shared_ptr(new CLIPVisionEmbeddings(hidden_size, num_channels, patch_size, image_size));
blocks["pre_layernorm"] = std::shared_ptr(new LayerNorm(hidden_size));
- blocks["encoder"] = std::shared_ptr(new CLIPEncoder(n_layer, hidden_size, n_head, intermediate_size));
+ blocks["encoder"] = std::shared_ptr(new CLIPEncoder(n_layer, hidden_size, n_head, intermediate_size, proj_in));
blocks["post_layernorm"] = std::shared_ptr(new LayerNorm(hidden_size));
}
- struct ggml_tensor* forward(struct ggml_context* ctx,
- ggml_backend_t backend,
+ struct ggml_tensor* forward(GGMLRunnerContext* ctx,
struct ggml_tensor* pixel_values,
bool return_pooled = true,
int clip_skip = -1) {
@@ -792,14 +793,14 @@ class CLIPVisionModel : public GGMLBlock {
auto x = embeddings->forward(ctx, pixel_values); // [N, num_positions, embed_dim]
x = pre_layernorm->forward(ctx, x);
- x = encoder->forward(ctx, backend, x, clip_skip, false);
+ x = encoder->forward(ctx, x, clip_skip, false);
// print_ggml_tensor(x, true, "ClipVisionModel x: ");
auto last_hidden_state = x;
x = post_layernorm->forward(ctx, x); // [N, n_token, hidden_size]
GGML_ASSERT(x->ne[3] == 1);
if (return_pooled) {
- ggml_tensor* pooled = ggml_cont(ctx, ggml_view_2d(ctx, x, x->ne[0], x->ne[2], x->nb[2], 0));
+ ggml_tensor* pooled = ggml_cont(ctx->ggml_ctx, ggml_view_2d(ctx->ggml_ctx, x, x->ne[0], x->ne[2], x->nb[2], 0));
return pooled; // [N, hidden_size]
} else {
// return x; // [N, n_token, hidden_size]
@@ -814,8 +815,8 @@ class CLIPProjection : public UnaryBlock {
int64_t out_features;
bool transpose_weight;
- void init_params(struct ggml_context* ctx, const String2GGMLType& tensor_types = {}, const std::string prefix = "") {
- enum ggml_type wtype = get_type(prefix + "weight", tensor_types, GGML_TYPE_F32);
+ void init_params(struct ggml_context* ctx, const String2TensorStorage& tensor_storage_map = {}, const std::string prefix = "") override {
+ enum ggml_type wtype = get_type(prefix + "weight", tensor_storage_map, GGML_TYPE_F32);
if (transpose_weight) {
params["weight"] = ggml_new_tensor_2d(ctx, wtype, out_features, in_features);
} else {
@@ -831,12 +832,12 @@ class CLIPProjection : public UnaryBlock {
out_features(out_features),
transpose_weight(transpose_weight) {}
- struct ggml_tensor* forward(struct ggml_context* ctx, struct ggml_tensor* x) {
+ struct ggml_tensor* forward(GGMLRunnerContext* ctx, struct ggml_tensor* x) override {
struct ggml_tensor* w = params["weight"];
if (transpose_weight) {
- w = ggml_cont(ctx, ggml_transpose(ctx, w));
+ w = ggml_cont(ctx->ggml_ctx, ggml_transpose(ctx->ggml_ctx, w));
}
- return ggml_nn_linear(ctx, x, w, NULL);
+ return ggml_ext_linear(ctx->ggml_ctx, x, w, nullptr);
}
};
@@ -848,7 +849,8 @@ class CLIPVisionModelProjection : public GGMLBlock {
public:
CLIPVisionModelProjection(CLIPVersion version = OPENAI_CLIP_VIT_L_14,
- bool transpose_proj_w = false) {
+ bool transpose_proj_w = false,
+ bool proj_in = false) {
if (version == OPEN_CLIP_VIT_H_14) {
hidden_size = 1280;
projection_dim = 1024;
@@ -856,12 +858,11 @@ class CLIPVisionModelProjection : public GGMLBlock {
hidden_size = 1664;
}
- blocks["vision_model"] = std::shared_ptr(new CLIPVisionModel(version));
+ blocks["vision_model"] = std::shared_ptr(new CLIPVisionModel(version, proj_in));
blocks["visual_projection"] = std::shared_ptr(new CLIPProjection(hidden_size, projection_dim, transpose_proj_w));
}
- struct ggml_tensor* forward(struct ggml_context* ctx,
- ggml_backend_t backend,
+ struct ggml_tensor* forward(GGMLRunnerContext* ctx,
struct ggml_tensor* pixel_values,
bool return_pooled = true,
int clip_skip = -1) {
@@ -870,7 +871,7 @@ class CLIPVisionModelProjection : public GGMLBlock {
auto vision_model = std::dynamic_pointer_cast<CLIPVisionModel>(blocks["vision_model"]);
auto visual_projection = std::dynamic_pointer_cast<CLIPProjection>(blocks["visual_projection"]);
- auto x = vision_model->forward(ctx, backend, pixel_values, return_pooled, clip_skip); // [N, hidden_size] or [N, n_token, hidden_size]
+ auto x = vision_model->forward(ctx, pixel_values, return_pooled, clip_skip); // [N, hidden_size] or [N, n_token, hidden_size]
if (return_pooled) {
x = visual_projection->forward(ctx, x); // [N, projection_dim]
@@ -885,55 +886,63 @@ struct CLIPTextModelRunner : public GGMLRunner {
CLIPTextModelRunner(ggml_backend_t backend,
bool offload_params_to_cpu,
- const String2GGMLType& tensor_types,
+ const String2TensorStorage& tensor_storage_map,
const std::string prefix,
CLIPVersion version = OPENAI_CLIP_VIT_L_14,
bool with_final_ln = true,
- int clip_skip_value = -1)
- : GGMLRunner(backend, offload_params_to_cpu), model(version, with_final_ln, clip_skip_value) {
- model.init(params_ctx, tensor_types, prefix);
+ bool force_clip_f32 = false)
+ : GGMLRunner(backend, offload_params_to_cpu) {
+ bool proj_in = false;
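+ // detect the attention layout from the checkpoint: tensors named
+ // "self_attn.in_proj" mean the weights store a fused qkv projection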
+ for (const auto& [name, tensor_storage] : tensor_storage_map) {
+ if (!starts_with(name, prefix)) {
+ continue;
+ }
+ if (contains(name, "self_attn.in_proj")) {
+ proj_in = true;
+ break;
+ }
+ }
+ model = CLIPTextModel(version, with_final_ln, force_clip_f32, proj_in);
+ model.init(params_ctx, tensor_storage_map, prefix);
}
- std::string get_desc() {
+ std::string get_desc() override {
return "clip";
}
- void set_clip_skip(int clip_skip) {
- model.set_clip_skip(clip_skip);
- }
-
void get_param_tensors(std::map<std::string, struct ggml_tensor*>& tensors, const std::string prefix) {
model.get_param_tensors(tensors, prefix);
}
- struct ggml_tensor* forward(struct ggml_context* ctx,
- ggml_backend_t backend,
+ struct ggml_tensor* forward(GGMLRunnerContext* ctx,
struct ggml_tensor* input_ids,
struct ggml_tensor* embeddings,
size_t max_token_idx = 0,
- bool return_pooled = false) {
+ bool return_pooled = false,
+ int clip_skip = -1) {
size_t N = input_ids->ne[1];
size_t n_token = input_ids->ne[0];
if (input_ids->ne[0] > model.n_token) {
GGML_ASSERT(input_ids->ne[0] % model.n_token == 0);
- input_ids = ggml_reshape_2d(ctx, input_ids, model.n_token, input_ids->ne[0] / model.n_token);
+ input_ids = ggml_reshape_2d(ctx->ggml_ctx, input_ids, model.n_token, input_ids->ne[0] / model.n_token);
}
- return model.forward(ctx, backend, input_ids, embeddings, max_token_idx, return_pooled);
+ return model.forward(ctx, input_ids, embeddings, max_token_idx, return_pooled, clip_skip);
}
struct ggml_cgraph* build_graph(struct ggml_tensor* input_ids,
int num_custom_embeddings = 0,
- void* custom_embeddings_data = NULL,
+ void* custom_embeddings_data = nullptr,
size_t max_token_idx = 0,
- bool return_pooled = false) {
- struct ggml_cgraph* gf = ggml_new_graph(compute_ctx);
+ bool return_pooled = false,
+ int clip_skip = -1) {
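+ // build the graph with a custom node budget (2048) instead of ggml's default graph size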
+ struct ggml_cgraph* gf = new_graph_custom(2048);
input_ids = to_backend(input_ids);
- struct ggml_tensor* embeddings = NULL;
+ struct ggml_tensor* embeddings = nullptr;
- if (num_custom_embeddings > 0 && custom_embeddings_data != NULL) {
+ if (num_custom_embeddings > 0 && custom_embeddings_data != nullptr) {
auto token_embed_weight = model.get_token_embed_weight();
auto custom_embeddings = ggml_new_tensor_2d(compute_ctx,
token_embed_weight->type,
@@ -945,7 +954,9 @@ struct CLIPTextModelRunner : public GGMLRunner {
embeddings = ggml_concat(compute_ctx, token_embed_weight, custom_embeddings, 1);
}
- struct ggml_tensor* hidden_states = forward(compute_ctx, runtime_backend, input_ids, embeddings, max_token_idx, return_pooled);
+ auto runner_ctx = get_context();
+
+ struct ggml_tensor* hidden_states = forward(&runner_ctx, input_ids, embeddings, max_token_idx, return_pooled, clip_skip);
ggml_build_forward_expand(gf, hidden_states);
@@ -958,10 +969,11 @@ struct CLIPTextModelRunner : public GGMLRunner {
void* custom_embeddings_data,
size_t max_token_idx,
bool return_pooled,
+ int clip_skip,
ggml_tensor** output,
- ggml_context* output_ctx = NULL) {
+ ggml_context* output_ctx = nullptr) {
auto get_graph = [&]() -> struct ggml_cgraph* {
- return build_graph(input_ids, num_custom_embeddings, custom_embeddings_data, max_token_idx, return_pooled);
+ return build_graph(input_ids, num_custom_embeddings, custom_embeddings_data, max_token_idx, return_pooled, clip_skip);
};
GGMLRunner::compute(get_graph, n_threads, true, output, output_ctx);
}
diff --git a/common.hpp b/common.hpp
index bf4da24ec..dd8281f9e 100644
--- a/common.hpp
+++ b/common.hpp
@@ -23,12 +23,12 @@ class DownSampleBlock : public GGMLBlock {
}
}
- struct ggml_tensor* forward(struct ggml_context* ctx, struct ggml_tensor* x) {
+ struct ggml_tensor* forward(GGMLRunnerContext* ctx, struct ggml_tensor* x) {
// x: [N, channels, h, w]
if (vae_downsample) {
auto conv = std::dynamic_pointer_cast<Conv2d>(blocks["conv"]);
- x = ggml_pad(ctx, x, 1, 1, 0, 0);
+ x = ggml_pad(ctx->ggml_ctx, x, 1, 1, 0, 0);
x = conv->forward(ctx, x);
} else {
auto conv = std::dynamic_pointer_cast<Conv2d>(blocks["op"]);
@@ -52,12 +52,12 @@ class UpSampleBlock : public GGMLBlock {
blocks["conv"] = std::shared_ptr(new Conv2d(channels, out_channels, {3, 3}, {1, 1}, {1, 1}));
}
- struct ggml_tensor* forward(struct ggml_context* ctx, struct ggml_tensor* x) {
+ struct ggml_tensor* forward(GGMLRunnerContext* ctx, struct ggml_tensor* x) {
// x: [N, channels, h, w]
auto conv = std::dynamic_pointer_cast<Conv2d>(blocks["conv"]);
- x = ggml_upscale(ctx, x, 2, GGML_SCALE_MODE_NEAREST); // [N, channels, h*2, w*2]
- x = conv->forward(ctx, x); // [N, out_channels, h*2, w*2]
+ x = ggml_upscale(ctx->ggml_ctx, x, 2, GGML_SCALE_MODE_NEAREST); // [N, channels, h*2, w*2]
+ x = conv->forward(ctx, x); // [N, out_channels, h*2, w*2]
return x;
}
};
@@ -121,7 +121,7 @@ class ResBlock : public GGMLBlock {
}
}
- virtual struct ggml_tensor* forward(struct ggml_context* ctx, struct ggml_tensor* x, struct ggml_tensor* emb = NULL) {
+ virtual struct ggml_tensor* forward(GGMLRunnerContext* ctx, struct ggml_tensor* x, struct ggml_tensor* emb = nullptr) {
// For dims==3, we reduce dimension from 5d to 4d by merging h and w, in order not to change ggml
// [N, c, t, h, w] => [N, c, t, h * w]
// x: [N, channels, h, w] if dims == 2 else [N, channels, t, h, w]
@@ -131,38 +131,38 @@ class ResBlock : public GGMLBlock {
auto out_layers_0 = std::dynamic_pointer_cast<GroupNorm32>(blocks["out_layers.0"]);
auto out_layers_3 = std::dynamic_pointer_cast<UnaryBlock>(blocks["out_layers.3"]);
- if (emb == NULL) {
+ if (emb == nullptr) {
GGML_ASSERT(skip_t_emb);
}
// in_layers
auto h = in_layers_0->forward(ctx, x);
- h = ggml_silu_inplace(ctx, h);
+ h = ggml_silu_inplace(ctx->ggml_ctx, h);
h = in_layers_2->forward(ctx, h); // [N, out_channels, h, w] if dims == 2 else [N, out_channels, t, h, w]
// emb_layers
if (!skip_t_emb) {
auto emb_layer_1 = std::dynamic_pointer_cast<Linear>(blocks["emb_layers.1"]);
- auto emb_out = ggml_silu(ctx, emb);
+ auto emb_out = ggml_silu(ctx->ggml_ctx, emb);
emb_out = emb_layer_1->forward(ctx, emb_out); // [N, out_channels] if dims == 2 else [N, t, out_channels]
if (dims == 2) {
- emb_out = ggml_reshape_4d(ctx, emb_out, 1, 1, emb_out->ne[0], emb_out->ne[1]); // [N, out_channels, 1, 1]
+ emb_out = ggml_reshape_4d(ctx->ggml_ctx, emb_out, 1, 1, emb_out->ne[0], emb_out->ne[1]); // [N, out_channels, 1, 1]
} else {
- emb_out = ggml_reshape_4d(ctx, emb_out, 1, emb_out->ne[0], emb_out->ne[1], emb_out->ne[2]); // [N, t, out_channels, 1]
+ emb_out = ggml_reshape_4d(ctx->ggml_ctx, emb_out, 1, emb_out->ne[0], emb_out->ne[1], emb_out->ne[2]); // [N, t, out_channels, 1]
if (exchange_temb_dims) {
// emb_out = rearrange(emb_out, "b t c ... -> b c t ...")
- emb_out = ggml_cont(ctx, ggml_permute(ctx, emb_out, 0, 2, 1, 3)); // [N, out_channels, t, 1]
+ emb_out = ggml_cont(ctx->ggml_ctx, ggml_permute(ctx->ggml_ctx, emb_out, 0, 2, 1, 3)); // [N, out_channels, t, 1]
}
}
- h = ggml_add(ctx, h, emb_out); // [N, out_channels, h, w] if dims == 2 else [N, out_channels, t, h, w]
+ h = ggml_add(ctx->ggml_ctx, h, emb_out); // [N, out_channels, h, w] if dims == 2 else [N, out_channels, t, h, w]
}
// out_layers
h = out_layers_0->forward(ctx, h);
- h = ggml_silu_inplace(ctx, h);
+ h = ggml_silu_inplace(ctx->ggml_ctx, h);
// dropout, skip for inference
h = out_layers_3->forward(ctx, h);
@@ -172,67 +172,91 @@ class ResBlock : public GGMLBlock {
x = skip_connection->forward(ctx, x); // [N, out_channels, h, w] if dims == 2 else [N, out_channels, t, h, w]
}
- h = ggml_add(ctx, h, x);
+ h = ggml_add(ctx->ggml_ctx, h, x);
return h; // [N, out_channels, h, w] if dims == 2 else [N, out_channels, t, h, w]
}
};
-class GEGLU : public GGMLBlock {
+class GEGLU : public UnaryBlock {
protected:
int64_t dim_in;
int64_t dim_out;
- void init_params(struct ggml_context* ctx, const String2GGMLType& tensor_types = {}, std::string prefix = "") {
- enum ggml_type wtype = get_type(prefix + "proj.weight", tensor_types, GGML_TYPE_F32);
- enum ggml_type bias_wtype = GGML_TYPE_F32;
- params["proj.weight"] = ggml_new_tensor_2d(ctx, wtype, dim_in, dim_out * 2);
- params["proj.bias"] = ggml_new_tensor_1d(ctx, bias_wtype, dim_out * 2);
- }
-
public:
GEGLU(int64_t dim_in, int64_t dim_out)
- : dim_in(dim_in), dim_out(dim_out) {}
+ : dim_in(dim_in), dim_out(dim_out) {
+ blocks["proj"] = std::shared_ptr(new Linear(dim_in, dim_out * 2));
+ }
- struct ggml_tensor* forward(struct ggml_context* ctx, struct ggml_tensor* x) {
+ struct ggml_tensor* forward(GGMLRunnerContext* ctx, struct ggml_tensor* x) override {
// x: [ne3, ne2, ne1, dim_in]
// return: [ne3, ne2, ne1, dim_out]
- struct ggml_tensor* w = params["proj.weight"];
- struct ggml_tensor* b = params["proj.bias"];
+ auto proj = std::dynamic_pointer_cast<Linear>(blocks["proj"]);
+
+ x = proj->forward(ctx, x); // [ne3, ne2, ne1, dim_out*2]
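+ // split the doubled projection along dim 0 into a value half and a gate half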
+ auto x_vec = ggml_ext_chunk(ctx->ggml_ctx, x, 2, 0);
+ x = x_vec[0]; // [ne3, ne2, ne1, dim_out]
+ auto gate = x_vec[1]; // [ne3, ne2, ne1, dim_out]
- auto x_w = ggml_view_2d(ctx, w, w->ne[0], w->ne[1] / 2, w->nb[1], 0); // [dim_out, dim_in]
- auto x_b = ggml_view_1d(ctx, b, b->ne[0] / 2, 0); // [dim_out, dim_in]
- auto gate_w = ggml_view_2d(ctx, w, w->ne[0], w->ne[1] / 2, w->nb[1], w->nb[1] * w->ne[1] / 2); // [dim_out, ]
- auto gate_b = ggml_view_1d(ctx, b, b->ne[0] / 2, b->nb[0] * b->ne[0] / 2); // [dim_out, ]
+ gate = ggml_gelu_inplace(ctx->ggml_ctx, gate);
- auto x_in = x;
- x = ggml_nn_linear(ctx, x_in, x_w, x_b); // [ne3, ne2, ne1, dim_out]
- auto gate = ggml_nn_linear(ctx, x_in, gate_w, gate_b); // [ne3, ne2, ne1, dim_out]
+ x = ggml_mul(ctx->ggml_ctx, x, gate); // [ne3, ne2, ne1, dim_out]
+
+ return x;
+ }
+};
- gate = ggml_gelu_inplace(ctx, gate);
+class GELU : public UnaryBlock {
+public:
+ GELU(int64_t dim_in, int64_t dim_out, bool bias = true) {
+ blocks["proj"] = std::shared_ptr(new Linear(dim_in, dim_out, bias));
+ }
- x = ggml_mul(ctx, x, gate); // [ne3, ne2, ne1, dim_out]
+ struct ggml_tensor* forward(GGMLRunnerContext* ctx, struct ggml_tensor* x) override {
+ // x: [ne3, ne2, ne1, dim_in]
+ // return: [ne3, ne2, ne1, dim_out]
+ auto proj = std::dynamic_pointer_cast<Linear>(blocks["proj"]);
+ x = proj->forward(ctx, x);
+ x = ggml_gelu_inplace(ctx->ggml_ctx, x);
return x;
}
};
class FeedForward : public GGMLBlock {
public:
+ enum class Activation {
+ GEGLU,
+ GELU
+ };
FeedForward(int64_t dim,
int64_t dim_out,
- int64_t mult = 4) {
+ int64_t mult = 4,
+ Activation activation = Activation::GEGLU,
+ bool precision_fix = false) {
int64_t inner_dim = dim * mult;
+ if (activation == Activation::GELU) {
+ blocks["net.0"] = std::shared_ptr(new GELU(dim, inner_dim));
+ } else {
+ blocks["net.0"] = std::shared_ptr(new GEGLU(dim, inner_dim));
+ }
- blocks["net.0"] = std::shared_ptr(new GEGLU(dim, inner_dim));
// net_1 is nn.Dropout(), skip for inference
- blocks["net.2"] = std::shared_ptr(new Linear(inner_dim, dim_out));
+ float scale = 1.f;
+ if (precision_fix) {
+ scale = 1.f / 128.f;
+ }
+ // The purpose of the scale here is to prevent NaN issues in certain situations.
+ // For example, when using Vulkan without enabling force_prec_f32,
+ // or when using CUDA but the weights are k-quants.
+ blocks["net.2"] = std::shared_ptr(new Linear(inner_dim, dim_out, true, false, false, scale));
}
- struct ggml_tensor* forward(struct ggml_context* ctx, struct ggml_tensor* x) {
+ struct ggml_tensor* forward(GGMLRunnerContext* ctx, struct ggml_tensor* x) {
// x: [ne3, ne2, ne1, dim]
// return: [ne3, ne2, ne1, dim_out]
- auto net_0 = std::dynamic_pointer_cast<GEGLU>(blocks["net.0"]);
+ auto net_0 = std::dynamic_pointer_cast<UnaryBlock>(blocks["net.0"]);
auto net_2 = std::dynamic_pointer_cast<Linear>(blocks["net.2"]);
x = net_0->forward(ctx, x); // [ne3, ne2, ne1, inner_dim]
@@ -247,19 +271,16 @@ class CrossAttention : public GGMLBlock {
int64_t context_dim;
int64_t n_head;
int64_t d_head;
- bool flash_attn;
public:
CrossAttention(int64_t query_dim,
int64_t context_dim,
int64_t n_head,
- int64_t d_head,
- bool flash_attn = false)
+ int64_t d_head)
: n_head(n_head),
d_head(d_head),
query_dim(query_dim),
- context_dim(context_dim),
- flash_attn(flash_attn) {
+ context_dim(context_dim) {
int64_t inner_dim = d_head * n_head;
blocks["to_q"] = std::shared_ptr(new Linear(query_dim, inner_dim, false));
@@ -270,8 +291,7 @@ class CrossAttention : public GGMLBlock {
// to_out_1 is nn.Dropout(), skip for inference
}
- struct ggml_tensor* forward(struct ggml_context* ctx,
- ggml_backend_t backend,
+ struct ggml_tensor* forward(GGMLRunnerContext* ctx,
struct ggml_tensor* x,
struct ggml_tensor* context) {
// x: [N, n_token, query_dim]
@@ -291,7 +311,7 @@ class CrossAttention : public GGMLBlock {
auto k = to_k->forward(ctx, context); // [N, n_context, inner_dim]
auto v = to_v->forward(ctx, context); // [N, n_context, inner_dim]
- x = ggml_nn_attention_ext(ctx, backend, q, k, v, n_head, NULL, false, false, flash_attn); // [N, n_token, inner_dim]
+ x = ggml_ext_attention_ext(ctx->ggml_ctx, ctx->backend, q, k, v, n_head, nullptr, false, false, ctx->flash_attn_enabled); // [N, n_token, inner_dim]
x = to_out_0->forward(ctx, x); // [N, n_token, query_dim]
return x;
@@ -309,16 +329,15 @@ class BasicTransformerBlock : public GGMLBlock {
int64_t n_head,
int64_t d_head,
int64_t context_dim,
- bool ff_in = false,
- bool flash_attn = false)
+ bool ff_in = false)
: n_head(n_head), d_head(d_head), ff_in(ff_in) {
// disable_self_attn is always False
// disable_temporal_crossattention is always False
// switch_temporal_ca_to_sa is always False
// inner_dim is always None or equal to dim
// gated_ff is always True
- blocks["attn1"] = std::shared_ptr(new CrossAttention(dim, dim, n_head, d_head, flash_attn));
- blocks["attn2"] = std::shared_ptr(new CrossAttention(dim, context_dim, n_head, d_head, flash_attn));
+ blocks["attn1"] = std::shared_ptr(new CrossAttention(dim, dim, n_head, d_head));
+ blocks["attn2"] = std::shared_ptr(new CrossAttention(dim, context_dim, n_head, d_head));
blocks["ff"] = std::shared_ptr(new FeedForward(dim, dim));
blocks["norm1"] = std::shared_ptr(new LayerNorm(dim));
blocks["norm2"] = std::shared_ptr(new LayerNorm(dim));
@@ -330,8 +349,7 @@ class BasicTransformerBlock : public GGMLBlock {
}
}
- struct ggml_tensor* forward(struct ggml_context* ctx,
- ggml_backend_t backend,
+ struct ggml_tensor* forward(GGMLRunnerContext* ctx,
struct ggml_tensor* x,
struct ggml_tensor* context) {
// x: [N, n_token, query_dim]
@@ -353,21 +371,21 @@ class BasicTransformerBlock : public GGMLBlock {
x = norm_in->forward(ctx, x);
x = ff_in->forward(ctx, x);
// self.is_res is always True
- x = ggml_add(ctx, x, x_skip);
+ x = ggml_add(ctx->ggml_ctx, x, x_skip);
}
auto r = x;
x = norm1->forward(ctx, x);
- x = attn1->forward(ctx, backend, x, x); // self-attention
- x = ggml_add(ctx, x, r);
+ x = attn1->forward(ctx, x, x); // self-attention
+ x = ggml_add(ctx->ggml_ctx, x, r);
r = x;
x = norm2->forward(ctx, x);
- x = attn2->forward(ctx, backend, x, context); // cross-attention
- x = ggml_add(ctx, x, r);
+ x = attn2->forward(ctx, x, context); // cross-attention
+ x = ggml_add(ctx->ggml_ctx, x, r);
r = x;
x = norm3->forward(ctx, x);
x = ff->forward(ctx, x);
- x = ggml_add(ctx, x, r);
+ x = ggml_add(ctx->ggml_ctx, x, r);
return x;
}
@@ -380,6 +398,23 @@ class SpatialTransformer : public GGMLBlock {
int64_t d_head;
int64_t depth = 1; // 1
int64_t context_dim = 768; // hidden_size, 1024 for VERSION_SD2
+ bool use_linear = false;
+
+ void init_params(struct ggml_context* ctx, const String2TensorStorage& tensor_storage_map = {}, const std::string prefix = "") {
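+ // infer the projection layout from the checkpoint: a 4-D proj_out.weight means
+ // conv2d 1x1 projections, a 2-D one means linear; override use_linear to match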
+ auto iter = tensor_storage_map.find(prefix + "proj_out.weight");
+ if (iter != tensor_storage_map.end()) {
+ int64_t inner_dim = n_head * d_head;
+ if (iter->second.n_dims == 4 && use_linear) {
+ use_linear = false;
+ blocks["proj_in"] = std::make_shared(in_channels, inner_dim, std::pair{1, 1});
+ blocks["proj_out"] = std::make_shared(inner_dim, in_channels, std::pair{1, 1});
+ } else if (iter->second.n_dims == 2 && !use_linear) {
+ use_linear = true;
+ blocks["proj_in"] = std::make_shared(in_channels, inner_dim);
+ blocks["proj_out"] = std::make_shared(inner_dim, in_channels);
+ }
+ }
+ }
public:
SpatialTransformer(int64_t in_channels,
@@ -387,35 +422,42 @@ class SpatialTransformer : public GGMLBlock {
int64_t d_head,
int64_t depth,
int64_t context_dim,
- bool flash_attn = false)
+ bool use_linear)
: in_channels(in_channels),
n_head(n_head),
d_head(d_head),
depth(depth),
- context_dim(context_dim) {
- // We will convert unet transformer linear to conv2d 1x1 when loading the weights, so use_linear is always False
+ context_dim(context_dim),
+ use_linear(use_linear) {
// disable_self_attn is always False
int64_t inner_dim = n_head * d_head; // in_channels
blocks["norm"] = std::shared_ptr(new GroupNorm32(in_channels));
- blocks["proj_in"] = std::shared_ptr(new Conv2d(in_channels, inner_dim, {1, 1}));
+ if (use_linear) {
+ blocks["proj_in"] = std::shared_ptr(new Linear(in_channels, inner_dim));
+ } else {
+ blocks["proj_in"] = std::shared_ptr(new Conv2d(in_channels, inner_dim, {1, 1}));
+ }
for (int i = 0; i < depth; i++) {
std::string name = "transformer_blocks." + std::to_string(i);
- blocks[name] = std::shared_ptr<GGMLBlock>(new BasicTransformerBlock(inner_dim, n_head, d_head, context_dim, false, flash_attn));
+ blocks[name] = std::shared_ptr<GGMLBlock>(new BasicTransformerBlock(inner_dim, n_head, d_head, context_dim, false));
}
- blocks["proj_out"] = std::shared_ptr(new Conv2d(inner_dim, in_channels, {1, 1}));
+ if (use_linear) {
+ blocks["proj_out"] = std::shared_ptr(new Linear(inner_dim, in_channels));
+ } else {
+ blocks["proj_out"] = std::shared_ptr(new Conv2d(inner_dim, in_channels, {1, 1}));
+ }
}
- virtual struct ggml_tensor* forward(struct ggml_context* ctx,
- ggml_backend_t backend,
+ virtual struct ggml_tensor* forward(GGMLRunnerContext* ctx,
struct ggml_tensor* x,
struct ggml_tensor* context) {
// x: [N, in_channels, h, w]
// context: [N, max_position(aka n_token), hidden_size(aka context_dim)]
auto norm = std::dynamic_pointer_cast<GroupNorm32>(blocks["norm"]);
- auto proj_in = std::dynamic_pointer_cast<Conv2d>(blocks["proj_in"]);
- auto proj_out = std::dynamic_pointer_cast<Conv2d>(blocks["proj_out"]);
+ auto proj_in = std::dynamic_pointer_cast<UnaryBlock>(blocks["proj_in"]);
+ auto proj_out = std::dynamic_pointer_cast<UnaryBlock>(blocks["proj_out"]);
auto x_in = x;
int64_t n = x->ne[3];
@@ -424,32 +466,45 @@ class SpatialTransformer : public GGMLBlock {
int64_t inner_dim = n_head * d_head;
x = norm->forward(ctx, x);
- x = proj_in->forward(ctx, x); // [N, inner_dim, h, w]
-
- x = ggml_cont(ctx, ggml_permute(ctx, x, 1, 2, 0, 3)); // [N, h, w, inner_dim]
- x = ggml_reshape_3d(ctx, x, inner_dim, w * h, n); // [N, h * w, inner_dim]
+ if (use_linear) {
+ x = ggml_cont(ctx->ggml_ctx, ggml_permute(ctx->ggml_ctx, x, 1, 2, 0, 3)); // [N, h, w, inner_dim]
+ x = ggml_reshape_3d(ctx->ggml_ctx, x, inner_dim, w * h, n); // [N, h * w, inner_dim]
+ x = proj_in->forward(ctx, x); // [N, h * w, inner_dim]
+ } else {
+ x = proj_in->forward(ctx, x); // [N, inner_dim, h, w]
+ x = ggml_cont(ctx->ggml_ctx, ggml_permute(ctx->ggml_ctx, x, 1, 2, 0, 3)); // [N, h, w, inner_dim]
+ x = ggml_reshape_3d(ctx->ggml_ctx, x, inner_dim, w * h, n); // [N, h * w, inner_dim]
+ }
for (int i = 0; i < depth; i++) {
std::string name = "transformer_blocks." + std::to_string(i);
auto transformer_block = std::dynamic_pointer_cast<BasicTransformerBlock>(blocks[name]);
- x = transformer_block->forward(ctx, backend, x, context);
+ x = transformer_block->forward(ctx, x, context);
}
- x = ggml_cont(ctx, ggml_permute(ctx, x, 1, 0, 2, 3)); // [N, inner_dim, h * w]
- x = ggml_reshape_4d(ctx, x, w, h, inner_dim, n); // [N, inner_dim, h, w]
+ if (use_linear) {
+ // proj_out
+ x = proj_out->forward(ctx, x); // [N, h * w, in_channels]
+
+ x = ggml_cont(ctx->ggml_ctx, ggml_permute(ctx->ggml_ctx, x, 1, 0, 2, 3)); // [N, inner_dim, h * w]
+ x = ggml_reshape_4d(ctx->ggml_ctx, x, w, h, inner_dim, n); // [N, inner_dim, h, w]
+ } else {
+ x = ggml_cont(ctx->ggml_ctx, ggml_permute(ctx->ggml_ctx, x, 1, 0, 2, 3)); // [N, inner_dim, h * w]
+ x = ggml_reshape_4d(ctx->ggml_ctx, x, w, h, inner_dim, n); // [N, inner_dim, h, w]
- // proj_out
- x = proj_out->forward(ctx, x); // [N, in_channels, h, w]
+ // proj_out
+ x = proj_out->forward(ctx, x); // [N, in_channels, h, w]
+ }
- x = ggml_add(ctx, x, x_in);
+ x = ggml_add(ctx->ggml_ctx, x, x_in);
return x;
}
};
class AlphaBlender : public GGMLBlock {
protected:
- void init_params(struct ggml_context* ctx, const String2GGMLType& tensor_types = {}, std::string prefix = "") {
+ void init_params(struct ggml_context* ctx, const String2TensorStorage& tensor_storage_map = {}, std::string prefix = "") override {
// Get the type of the "mix_factor" tensor from the input tensors map with the specified prefix
enum ggml_type wtype = GGML_TYPE_F32;
params["mix_factor"] = ggml_new_tensor_1d(ctx, wtype, 1);
@@ -458,7 +513,7 @@ class AlphaBlender : public GGMLBlock {
float get_alpha() {
// image_only_indicator is always tensor([0.]) and since mix_factor.shape is [1,]
// so learned_with_images is same as learned
- float alpha = ggml_backend_tensor_get_f32(params["mix_factor"]);
+ float alpha = ggml_ext_backend_tensor_get_f32(params["mix_factor"]);
return sigmoid(alpha);
}
@@ -469,14 +524,14 @@ class AlphaBlender : public GGMLBlock {
// since mix_factor.shape is [1,], we don't need rearrange using rearrange_pattern
}
- struct ggml_tensor* forward(struct ggml_context* ctx,
+ struct ggml_tensor* forward(GGMLRunnerContext* ctx,
struct ggml_tensor* x_spatial,
struct ggml_tensor* x_temporal) {
// image_only_indicator is always tensor([0.])
float alpha = get_alpha();
- auto x = ggml_add(ctx,
- ggml_scale(ctx, x_spatial, alpha),
- ggml_scale(ctx, x_temporal, 1.0f - alpha));
+ auto x = ggml_add(ctx->ggml_ctx,
+ ggml_scale(ctx->ggml_ctx, x_spatial, alpha),
+ ggml_scale(ctx->ggml_ctx, x_temporal, 1.0f - alpha));
return x;
}
};
@@ -494,7 +549,7 @@ class VideoResBlock : public ResBlock {
blocks["time_mixer"] = std::shared_ptr(new AlphaBlender());
}
- struct ggml_tensor* forward(struct ggml_context* ctx,
+ struct ggml_tensor* forward(GGMLRunnerContext* ctx,
struct ggml_tensor* x,
struct ggml_tensor* emb,
int num_video_frames) {
@@ -512,18 +567,18 @@ class VideoResBlock : public ResBlock {
int64_t H = x->ne[1];
int64_t W = x->ne[0];
- x = ggml_reshape_4d(ctx, x, W * H, C, T, B); // (b t) c h w -> b t c (h w)
- x = ggml_cont(ctx, ggml_permute(ctx, x, 0, 2, 1, 3)); // b t c (h w) -> b c t (h w)
+ x = ggml_reshape_4d(ctx->ggml_ctx, x, W * H, C, T, B); // (b t) c h w -> b t c (h w)
+ x = ggml_cont(ctx->ggml_ctx, ggml_permute(ctx->ggml_ctx, x, 0, 2, 1, 3)); // b t c (h w) -> b c t (h w)
auto x_mix = x;
- emb = ggml_reshape_4d(ctx, emb, emb->ne[0], T, B, emb->ne[3]); // (b t) ... -> b t ...
+ emb = ggml_reshape_4d(ctx->ggml_ctx, emb, emb->ne[0], T, B, emb->ne[3]); // (b t) ... -> b t ...
x = time_stack->forward(ctx, x, emb); // b t c (h w)
x = time_mixer->forward(ctx, x_mix, x); // b t c (h w)
- x = ggml_cont(ctx, ggml_permute(ctx, x, 0, 2, 1, 3)); // b c t (h w) -> b t c (h w)
- x = ggml_reshape_4d(ctx, x, W, H, C, T * B); // b t c (h w) -> (b t) c h w
+ x = ggml_cont(ctx->ggml_ctx, ggml_permute(ctx->ggml_ctx, x, 0, 2, 1, 3)); // b c t (h w) -> b t c (h w)
+ x = ggml_reshape_4d(ctx->ggml_ctx, x, W, H, C, T * B); // b t c (h w) -> (b t) c h w
return x;
}
diff --git a/conditioner.hpp b/conditioner.hpp
index cfd2b4ca7..27d367a9c 100644
--- a/conditioner.hpp
+++ b/conditioner.hpp
@@ -2,42 +2,48 @@
#define __CONDITIONER_HPP__
#include "clip.hpp"
+#include "qwenvl.hpp"
#include "t5.hpp"
struct SDCondition {
- struct ggml_tensor* c_crossattn = NULL; // aka context
- struct ggml_tensor* c_vector = NULL; // aka y
- struct ggml_tensor* c_concat = NULL;
+ struct ggml_tensor* c_crossattn = nullptr; // aka context
+ struct ggml_tensor* c_vector = nullptr; // aka y
+ struct ggml_tensor* c_concat = nullptr;
SDCondition() = default;
SDCondition(struct ggml_tensor* c_crossattn, struct ggml_tensor* c_vector, struct ggml_tensor* c_concat)
: c_crossattn(c_crossattn), c_vector(c_vector), c_concat(c_concat) {}
};
+struct ConditionerParams {
+ std::string text;
+ int clip_skip = -1;
+ int width = -1;
+ int height = -1;
+ int adm_in_channels = -1;
+ bool zero_out_masked = false;
+ int num_input_imgs = 0; // for photomaker
+ std::vector<sd_image_t*> ref_images = {}; // for qwen image edit
+};
+
struct Conditioner {
virtual SDCondition get_learned_condition(ggml_context* work_ctx,
int n_threads,
- const std::string& text,
- int clip_skip,
- int width,
- int height,
- int adm_in_channels = -1,
- bool zero_out_masked = false) = 0;
- virtual void alloc_params_buffer() = 0;
- virtual void free_params_buffer() = 0;
- virtual void get_param_tensors(std::map<std::string, struct ggml_tensor*>& tensors) = 0;
- virtual size_t get_params_buffer_size() = 0;
+ const ConditionerParams& conditioner_params) = 0;
+ virtual void alloc_params_buffer() = 0;
+ virtual void free_params_buffer() = 0;
+ virtual void get_param_tensors(std::map<std::string, struct ggml_tensor*>& tensors) = 0;
+ virtual size_t get_params_buffer_size() = 0;
+ virtual void set_weight_adapter(const std::shared_ptr<WeightAdapter>& adapter) {}
virtual std::tuple<SDCondition, std::vector<bool>> get_learned_condition_with_trigger(ggml_context* work_ctx,
int n_threads,
- const std::string& text,
- int clip_skip,
- int width,
- int height,
- int num_input_imgs,
- int adm_in_channels = -1,
- bool zero_out_masked = false) = 0;
+ const ConditionerParams& conditioner_params) {
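+ // default stub: only conditioners that support trigger words (photomaker) override this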
+ GGML_ABORT("Not implemented yet!");
+ }
virtual std::string remove_trigger_from_prompt(ggml_context* work_ctx,
- const std::string& prompt) = 0;
+ const std::string& prompt) {
+ GGML_ABORT("Not implemented yet!");
+ }
};
// ldm.modules.encoders.modules.FrozenCLIPEmbedder
@@ -58,58 +64,44 @@ struct FrozenCLIPEmbedderWithCustomWords : public Conditioner {
FrozenCLIPEmbedderWithCustomWords(ggml_backend_t backend,
bool offload_params_to_cpu,
- const String2GGMLType& tensor_types,
+ const String2TensorStorage& tensor_storage_map,
const std::string& embd_dir,
SDVersion version = VERSION_SD1,
- PMVersion pv = PM_VERSION_1,
- int clip_skip = -1)
+ PMVersion pv = PM_VERSION_1)
: version(version), pm_version(pv), tokenizer(sd_version_is_sd2(version) ? 0 : 49407), embd_dir(embd_dir) {
+ bool force_clip_f32 = embd_dir.size() > 0;
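+ // keep CLIP weights in f32 when an embedding dir is set, since custom embeddings get concatenated into the token-embedding table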
if (sd_version_is_sd1(version)) {
- text_model = std::make_shared<CLIPTextModelRunner>(backend, offload_params_to_cpu, tensor_types, "cond_stage_model.transformer.text_model", OPENAI_CLIP_VIT_L_14);
+ text_model = std::make_shared<CLIPTextModelRunner>(backend, offload_params_to_cpu, tensor_storage_map, "cond_stage_model.transformer.text_model", OPENAI_CLIP_VIT_L_14, true, force_clip_f32);
} else if (sd_version_is_sd2(version)) {
- text_model = std::make_shared<CLIPTextModelRunner>(backend, offload_params_to_cpu, tensor_types, "cond_stage_model.transformer.text_model", OPEN_CLIP_VIT_H_14);
+ text_model = std::make_shared<CLIPTextModelRunner>(backend, offload_params_to_cpu, tensor_storage_map, "cond_stage_model.transformer.text_model", OPEN_CLIP_VIT_H_14, true, force_clip_f32);
} else if (sd_version_is_sdxl(version)) {
- text_model = std::make_shared<CLIPTextModelRunner>(backend, offload_params_to_cpu, tensor_types, "cond_stage_model.transformer.text_model", OPENAI_CLIP_VIT_L_14, false);
- text_model2 = std::make_shared<CLIPTextModelRunner>(backend, offload_params_to_cpu, tensor_types, "cond_stage_model.1.transformer.text_model", OPEN_CLIP_VIT_BIGG_14, false);
- }
- set_clip_skip(clip_skip);
- }
-
- void set_clip_skip(int clip_skip) {
- if (clip_skip <= 0) {
- clip_skip = 1;
- if (sd_version_is_sd2(version) || sd_version_is_sdxl(version)) {
- clip_skip = 2;
- }
- }
- text_model->set_clip_skip(clip_skip);
- if (sd_version_is_sdxl(version)) {
- text_model2->set_clip_skip(clip_skip);
+ text_model = std::make_shared<CLIPTextModelRunner>(backend, offload_params_to_cpu, tensor_storage_map, "cond_stage_model.transformer.text_model", OPENAI_CLIP_VIT_L_14, false, force_clip_f32);
+ text_model2 = std::make_shared<CLIPTextModelRunner>(backend, offload_params_to_cpu, tensor_storage_map, "cond_stage_model.1.transformer.text_model", OPEN_CLIP_VIT_BIGG_14, false, force_clip_f32);
}
}
- void get_param_tensors(std::map<std::string, struct ggml_tensor*>& tensors) {
+ void get_param_tensors(std::map<std::string, struct ggml_tensor*>& tensors) override {
text_model->get_param_tensors(tensors, "cond_stage_model.transformer.text_model");
if (sd_version_is_sdxl(version)) {
text_model2->get_param_tensors(tensors, "cond_stage_model.1.transformer.text_model");
}
}
- void alloc_params_buffer() {
+ void alloc_params_buffer() override {
text_model->alloc_params_buffer();
if (sd_version_is_sdxl(version)) {
text_model2->alloc_params_buffer();
}
}
- void free_params_buffer() {
+ void free_params_buffer() override {
text_model->free_params_buffer();
if (sd_version_is_sdxl(version)) {
text_model2->free_params_buffer();
}
}
- size_t get_params_buffer_size() {
+ size_t get_params_buffer_size() override {
size_t buffer_size = text_model->get_params_buffer_size();
if (sd_version_is_sdxl(version)) {
buffer_size += text_model2->get_params_buffer_size();
@@ -117,10 +109,17 @@ struct FrozenCLIPEmbedderWithCustomWords : public Conditioner {
return buffer_size;
}
+ void set_weight_adapter(const std::shared_ptr<WeightAdapter>& adapter) override {
+ text_model->set_weight_adapter(adapter);
+ if (sd_version_is_sdxl(version)) {
+ text_model2->set_weight_adapter(adapter);
+ }
+ }
+
bool load_embedding(std::string embd_name, std::string embd_path, std::vector<int>& bpe_tokens) {
// the order matters
ModelLoader model_loader;
- if (!model_loader.init_from_file(embd_path)) {
+ if (!model_loader.init_from_file_and_convert_name(embd_path)) {
LOG_ERROR("embedding '%s' failed", embd_name.c_str());
return false;
}
@@ -129,12 +128,12 @@ struct FrozenCLIPEmbedderWithCustomWords : public Conditioner {
return true;
}
struct ggml_init_params params;
- params.mem_size = 10 * 1024 * 1024; // max for custom embeddings 10 MB
- params.mem_buffer = NULL;
+ params.mem_size = 100 * 1024 * 1024; // max for custom embeddings 100 MB
+ params.mem_buffer = nullptr;
params.no_alloc = false;
struct ggml_context* embd_ctx = ggml_init(params);
- struct ggml_tensor* embd = NULL;
- struct ggml_tensor* embd2 = NULL;
+ struct ggml_tensor* embd = nullptr;
+ struct ggml_tensor* embd2 = nullptr;
auto on_load = [&](const TensorStorage& tensor_storage, ggml_tensor** dst_tensor) {
if (tensor_storage.ne[0] != text_model->model.hidden_size) {
if (text_model2) {
@@ -155,7 +154,7 @@ struct FrozenCLIPEmbedderWithCustomWords : public Conditioner {
}
return true;
};
- model_loader.load_tensors(on_load);
+ model_loader.load_tensors(on_load, 1);
readed_embeddings.push_back(embd_name);
if (embd) {
int64_t hidden_size = text_model->model.hidden_size;
@@ -412,15 +411,18 @@ struct FrozenCLIPEmbedderWithCustomWords : public Conditioner {
int height,
int adm_in_channels = -1,
bool zero_out_masked = false) {
- set_clip_skip(clip_skip);
int64_t t0 = ggml_time_ms();
- struct ggml_tensor* hidden_states = NULL; // [N, n_token, hidden_size]
- struct ggml_tensor* chunk_hidden_states = NULL; // [n_token, hidden_size] or [n_token, hidden_size + hidden_size2]
- struct ggml_tensor* chunk_hidden_states1 = NULL; // [n_token, hidden_size]
- struct ggml_tensor* chunk_hidden_states2 = NULL; // [n_token, hidden_size2]
- struct ggml_tensor* pooled = NULL;
+ struct ggml_tensor* hidden_states = nullptr; // [N, n_token, hidden_size]
+ struct ggml_tensor* chunk_hidden_states = nullptr; // [n_token, hidden_size] or [n_token, hidden_size + hidden_size2]
+ struct ggml_tensor* chunk_hidden_states1 = nullptr; // [n_token, hidden_size]
+ struct ggml_tensor* chunk_hidden_states2 = nullptr; // [n_token, hidden_size2]
+ struct ggml_tensor* pooled = nullptr;
std::vector<float> hidden_states_vec;
+ if (clip_skip <= 0) {
+ clip_skip = (sd_version_is_sd2(version) || sd_version_is_sdxl(version)) ? 2 : 1;
+ }
+
size_t chunk_len = 77;
size_t chunk_count = tokens.size() / chunk_len;
for (int chunk_idx = 0; chunk_idx < chunk_count; chunk_idx++) {
@@ -430,7 +432,7 @@ struct FrozenCLIPEmbedderWithCustomWords : public Conditioner {
weights.begin() + (chunk_idx + 1) * chunk_len);
auto input_ids = vector_to_ggml_tensor_i32(work_ctx, chunk_tokens);
- struct ggml_tensor* input_ids2 = NULL;
+ struct ggml_tensor* input_ids2 = nullptr;
size_t max_token_idx = 0;
if (sd_version_is_sdxl(version)) {
auto it = std::find(chunk_tokens.begin(), chunk_tokens.end(), tokenizer.EOS_TOKEN_ID);
@@ -455,6 +457,7 @@ struct FrozenCLIPEmbedderWithCustomWords : public Conditioner {
token_embed_custom.data(),
max_token_idx,
false,
+ clip_skip,
&chunk_hidden_states1,
work_ctx);
if (sd_version_is_sdxl(version)) {
@@ -464,9 +467,10 @@ struct FrozenCLIPEmbedderWithCustomWords : public Conditioner {
token_embed_custom.data(),
max_token_idx,
false,
+ clip_skip,
&chunk_hidden_states2, work_ctx);
// concat
- chunk_hidden_states = ggml_tensor_concat(work_ctx, chunk_hidden_states1, chunk_hidden_states2, 0);
+ chunk_hidden_states = ggml_ext_tensor_concat(work_ctx, chunk_hidden_states1, chunk_hidden_states2, 0);
if (chunk_idx == 0) {
text_model2->compute(n_threads,
@@ -475,6 +479,7 @@ struct FrozenCLIPEmbedderWithCustomWords : public Conditioner {
token_embed_custom.data(),
max_token_idx,
true,
+ clip_skip,
&pooled,
work_ctx);
}
@@ -487,18 +492,18 @@ struct FrozenCLIPEmbedderWithCustomWords : public Conditioner {
LOG_DEBUG("computing condition graph completed, taking %" PRId64 " ms", t1 - t0);
ggml_tensor* result = ggml_dup_tensor(work_ctx, chunk_hidden_states);
{
- float original_mean = ggml_tensor_mean(chunk_hidden_states);
+ float original_mean = ggml_ext_tensor_mean(chunk_hidden_states);
for (int i2 = 0; i2 < chunk_hidden_states->ne[2]; i2++) {
for (int i1 = 0; i1 < chunk_hidden_states->ne[1]; i1++) {
for (int i0 = 0; i0 < chunk_hidden_states->ne[0]; i0++) {
- float value = ggml_tensor_get_f32(chunk_hidden_states, i0, i1, i2);
+ float value = ggml_ext_tensor_get_f32(chunk_hidden_states, i0, i1, i2);
value *= chunk_weights[i1];
- ggml_tensor_set_f32(result, value, i0, i1, i2);
+ ggml_ext_tensor_set_f32(result, value, i0, i1, i2);
}
}
}
- float new_mean = ggml_tensor_mean(result);
- ggml_tensor_scale(result, (original_mean / new_mean));
+ float new_mean = ggml_ext_tensor_mean(result);
+ ggml_ext_tensor_scale_inplace(result, (original_mean / new_mean));
}
if (zero_out_masked) {
float* vec = (float*)result->data;
@@ -515,7 +520,7 @@ struct FrozenCLIPEmbedderWithCustomWords : public Conditioner {
chunk_hidden_states->ne[0],
ggml_nelements(hidden_states) / chunk_hidden_states->ne[0]);
- ggml_tensor* vec = NULL;
+ ggml_tensor* vec = nullptr;
if (sd_version_is_sdxl(version)) {
int out_dim = 256;
vec = ggml_new_tensor_1d(work_ctx, GGML_TYPE_F32, adm_in_channels);
@@ -552,26 +557,20 @@ struct FrozenCLIPEmbedderWithCustomWords : public Conditioner {
GGML_ASSERT(offset == ggml_nbytes(vec));
}
// print_ggml_tensor(result);
- return SDCondition(hidden_states, vec, NULL);
+ return {hidden_states, vec, nullptr};
}
std::tuple<SDCondition, std::vector<bool>>
get_learned_condition_with_trigger(ggml_context* work_ctx,
int n_threads,
- const std::string& text,
- int clip_skip,
- int width,
- int height,
- int num_input_imgs,
- int adm_in_channels = -1,
- bool zero_out_masked = false) {
+ const ConditionerParams& conditioner_params) override {
auto image_tokens = convert_token_to_id(trigger_word);
// if(image_tokens.size() == 1){
// printf(" image token id is: %d \n", image_tokens[0]);
// }
GGML_ASSERT(image_tokens.size() == 1);
- auto tokens_and_weights = tokenize_with_trigger_token(text,
- num_input_imgs,
+ auto tokens_and_weights = tokenize_with_trigger_token(conditioner_params.text,
+ conditioner_params.num_input_imgs,
image_tokens[0],
true);
std::vector<int>& tokens = std::get<0>(tokens_and_weights);
@@ -585,12 +584,20 @@ struct FrozenCLIPEmbedderWithCustomWords : public Conditioner {
// for(int i = 0; i < clsm.size(); ++i)
// printf("%d ", clsm[i]?1:0);
// printf("\n");
- auto cond = get_learned_condition_common(work_ctx, n_threads, tokens, weights, clip_skip, width, height, adm_in_channels, zero_out_masked);
+ auto cond = get_learned_condition_common(work_ctx,
+ n_threads,
+ tokens,
+ weights,
+ conditioner_params.clip_skip,
+ conditioner_params.width,
+ conditioner_params.height,
+ conditioner_params.adm_in_channels,
+ conditioner_params.zero_out_masked);
return std::make_tuple(cond, clsm);
}
std::string remove_trigger_from_prompt(ggml_context* work_ctx,
- const std::string& prompt) {
+ const std::string& prompt) override {
auto image_tokens = convert_token_to_id(trigger_word);
GGML_ASSERT(image_tokens.size() == 1);
auto tokens_and_weights = tokenize(prompt, false);
@@ -603,16 +610,19 @@ struct FrozenCLIPEmbedderWithCustomWords : public Conditioner {
SDCondition get_learned_condition(ggml_context* work_ctx,
int n_threads,
- const std::string& text,
- int clip_skip,
- int width,
- int height,
- int adm_in_channels = -1,
- bool zero_out_masked = false) {
- auto tokens_and_weights = tokenize(text, true);
+ const ConditionerParams& conditioner_params) override {
+ auto tokens_and_weights = tokenize(conditioner_params.text, true);
std::vector<int>& tokens = tokens_and_weights.first;
std::vector<float>& weights = tokens_and_weights.second;
- return get_learned_condition_common(work_ctx, n_threads, tokens, weights, clip_skip, width, height, adm_in_channels, zero_out_masked);
+ return get_learned_condition_common(work_ctx,
+ n_threads,
+ tokens,
+ weights,
+ conditioner_params.clip_skip,
+ conditioner_params.width,
+ conditioner_params.height,
+ conditioner_params.adm_in_channels,
+ conditioner_params.zero_out_masked);
}
};
@@ -621,12 +631,24 @@ struct FrozenCLIPVisionEmbedder : public GGMLRunner {
FrozenCLIPVisionEmbedder(ggml_backend_t backend,
bool offload_params_to_cpu,
- const String2GGMLType& tensor_types = {})
- : vision_model(OPEN_CLIP_VIT_H_14), GGMLRunner(backend, offload_params_to_cpu) {
- vision_model.init(params_ctx, tensor_types, "cond_stage_model.transformer");
+ const String2TensorStorage& tensor_storage_map = {})
+ : GGMLRunner(backend, offload_params_to_cpu) {
+ std::string prefix = "cond_stage_model.transformer";
+ bool proj_in = false;
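+ // same heuristic as CLIPTextModelRunner: fused qkv weights ("self_attn.in_proj")
+ // in the checkpoint select the fused-projection attention layout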
+ for (const auto& [name, tensor_storage] : tensor_storage_map) {
+ if (!starts_with(name, prefix)) {
+ continue;
+ }
+ if (contains(name, "self_attn.in_proj")) {
+ proj_in = true;
+ break;
+ }
+ }
+ vision_model = CLIPVisionModelProjection(OPEN_CLIP_VIT_H_14, false, proj_in);
+ vision_model.init(params_ctx, tensor_storage_map, prefix);
}
- std::string get_desc() {
+ std::string get_desc() override {
return "clip_vision";
}
@@ -639,7 +661,9 @@ struct FrozenCLIPVisionEmbedder : public GGMLRunner {
pixel_values = to_backend(pixel_values);
- struct ggml_tensor* hidden_states = vision_model.forward(compute_ctx, runtime_backend, pixel_values, return_pooled, clip_skip);
+ auto runner_ctx = get_context();
+
+ struct ggml_tensor* hidden_states = vision_model.forward(&runner_ctx, pixel_values, return_pooled, clip_skip);
ggml_build_forward_expand(gf, hidden_states);
@@ -669,46 +693,95 @@ struct SD3CLIPEmbedder : public Conditioner {
SD3CLIPEmbedder(ggml_backend_t backend,
bool offload_params_to_cpu,
- const String2GGMLType& tensor_types = {},
- int clip_skip = -1)
+ const String2TensorStorage& tensor_storage_map = {})
: clip_g_tokenizer(0) {
- clip_l = std::make_shared<CLIPTextModelRunner>(backend, offload_params_to_cpu, tensor_types, "text_encoders.clip_l.transformer.text_model", OPENAI_CLIP_VIT_L_14, false);
- clip_g = std::make_shared<CLIPTextModelRunner>(backend, offload_params_to_cpu, tensor_types, "text_encoders.clip_g.transformer.text_model", OPEN_CLIP_VIT_BIGG_14, false);
- t5 = std::make_shared<T5Runner>(backend, offload_params_to_cpu, tensor_types, "text_encoders.t5xxl.transformer");
- set_clip_skip(clip_skip);
+ bool use_clip_l = false;
+ bool use_clip_g = false;
+ bool use_t5 = false;
+ for (auto pair : tensor_storage_map) {
+ if (pair.first.find("text_encoders.clip_l") != std::string::npos) {
+ use_clip_l = true;
+ } else if (pair.first.find("text_encoders.clip_g") != std::string::npos) {
+ use_clip_g = true;
+ } else if (pair.first.find("text_encoders.t5xxl") != std::string::npos) {
+ use_t5 = true;
+ }
+ }
+ if (!use_clip_l && !use_clip_g && !use_t5) {
+ LOG_WARN("IMPORTANT NOTICE: No text encoders provided, cannot process prompts!");
+ return;
+ }
+ if (use_clip_l) {
+ clip_l = std::make_shared<CLIPTextModelRunner>(backend, offload_params_to_cpu, tensor_storage_map, "text_encoders.clip_l.transformer.text_model", OPENAI_CLIP_VIT_L_14, false);
+ }
+ if (use_clip_g) {
+ clip_g = std::make_shared<CLIPTextModelRunner>(backend, offload_params_to_cpu, tensor_storage_map, "text_encoders.clip_g.transformer.text_model", OPEN_CLIP_VIT_BIGG_14, false);
+ }
+ if (use_t5) {
+ t5 = std::make_shared<T5Runner>(backend, offload_params_to_cpu, tensor_storage_map, "text_encoders.t5xxl.transformer");
+ }
}
- void set_clip_skip(int clip_skip) {
- if (clip_skip <= 0) {
- clip_skip = 2;
+ void get_param_tensors(std::map<std::string, struct ggml_tensor*>& tensors) override {
+ if (clip_l) {
+ clip_l->get_param_tensors(tensors, "text_encoders.clip_l.transformer.text_model");
+ }
+ if (clip_g) {
+ clip_g->get_param_tensors(tensors, "text_encoders.clip_g.transformer.text_model");
+ }
+ if (t5) {
+ t5->get_param_tensors(tensors, "text_encoders.t5xxl.transformer");
}
- clip_l->set_clip_skip(clip_skip);
- clip_g->set_clip_skip(clip_skip);
}
- void get_param_tensors(std::map<std::string, struct ggml_tensor*>& tensors) {
- clip_l->get_param_tensors(tensors, "text_encoders.clip_l.transformer.text_model");
- clip_g->get_param_tensors(tensors, "text_encoders.clip_g.transformer.text_model");
- t5->get_param_tensors(tensors, "text_encoders.t5xxl.transformer");
+ void alloc_params_buffer() override {
+ if (clip_l) {
+ clip_l->alloc_params_buffer();
+ }
+ if (clip_g) {
+ clip_g->alloc_params_buffer();
+ }
+ if (t5) {
+ t5->alloc_params_buffer();
+ }
}
- void alloc_params_buffer() {
- clip_l->alloc_params_buffer();
- clip_g->alloc_params_buffer();
- t5->alloc_params_buffer();
+ void free_params_buffer() override {
+ if (clip_l) {
+ clip_l->free_params_buffer();
+ }
+ if (clip_g) {
+ clip_g->free_params_buffer();
+ }
+ if (t5) {
+ t5->free_params_buffer();
+ }
}
- void free_params_buffer() {
- clip_l->free_params_buffer();
- clip_g->free_params_buffer();
- t5->free_params_buffer();
+ size_t get_params_buffer_size() override {
+ size_t buffer_size = 0;
+ if (clip_l) {
+ buffer_size += clip_l->get_params_buffer_size();
+ }
+ if (clip_g) {
+ buffer_size += clip_g->get_params_buffer_size();
+ }
+ if (t5) {
+ buffer_size += t5->get_params_buffer_size();
+ }
+ return buffer_size;
}
- size_t get_params_buffer_size() {
- size_t buffer_size = clip_l->get_params_buffer_size();
- buffer_size += clip_g->get_params_buffer_size();
- buffer_size += t5->get_params_buffer_size();
- return buffer_size;
+ void set_weight_adapter(const std::shared_ptr<WeightAdapter>& adapter) override {
+ if (clip_l) {
+ clip_l->set_weight_adapter(adapter);
+ }
+ if (clip_g) {
+ clip_g->set_weight_adapter(adapter);
+ }
+ if (t5) {
+ t5->set_weight_adapter(adapter);
+ }
}
std::vector<std::pair<std::vector<int>, std::vector<float>>> tokenize(std::string text,
@@ -739,23 +812,32 @@ struct SD3CLIPEmbedder : public Conditioner {
for (const auto& item : parsed_attention) {
const std::string& curr_text = item.first;
float curr_weight = item.second;
-
- std::vector<int> curr_tokens = clip_l_tokenizer.encode(curr_text, on_new_token_cb);
- clip_l_tokens.insert(clip_l_tokens.end(), curr_tokens.begin(), curr_tokens.end());
- clip_l_weights.insert(clip_l_weights.end(), curr_tokens.size(), curr_weight);
-
- curr_tokens = clip_g_tokenizer.encode(curr_text, on_new_token_cb);
- clip_g_tokens.insert(clip_g_tokens.end(), curr_tokens.begin(), curr_tokens.end());
- clip_g_weights.insert(clip_g_weights.end(), curr_tokens.size(), curr_weight);
-
- curr_tokens = t5_tokenizer.Encode(curr_text, true);
- t5_tokens.insert(t5_tokens.end(), curr_tokens.begin(), curr_tokens.end());
- t5_weights.insert(t5_weights.end(), curr_tokens.size(), curr_weight);
+ if (clip_l) {
+ std::vector<int> curr_tokens = clip_l_tokenizer.encode(curr_text, on_new_token_cb);
+ clip_l_tokens.insert(clip_l_tokens.end(), curr_tokens.begin(), curr_tokens.end());
+ clip_l_weights.insert(clip_l_weights.end(), curr_tokens.size(), curr_weight);
+ }
+ if (clip_g) {
+ std::vector<int> curr_tokens = clip_g_tokenizer.encode(curr_text, on_new_token_cb);
+ clip_g_tokens.insert(clip_g_tokens.end(), curr_tokens.begin(), curr_tokens.end());
+ clip_g_weights.insert(clip_g_weights.end(), curr_tokens.size(), curr_weight);
+ }
+ if (t5) {
+ std::vector<int> curr_tokens = t5_tokenizer.Encode(curr_text, true);
+ t5_tokens.insert(t5_tokens.end(), curr_tokens.begin(), curr_tokens.end());
+ t5_weights.insert(t5_weights.end(), curr_tokens.size(), curr_weight);
+ }
}
- clip_l_tokenizer.pad_tokens(clip_l_tokens, clip_l_weights, max_length, padding);
- clip_g_tokenizer.pad_tokens(clip_g_tokens, clip_g_weights, max_length, padding);
- t5_tokenizer.pad_tokens(t5_tokens, t5_weights, NULL, max_length, padding);
+ if (clip_l) {
+ clip_l_tokenizer.pad_tokens(clip_l_tokens, clip_l_weights, max_length, padding);
+ }
+ if (clip_g) {
+ clip_g_tokenizer.pad_tokens(clip_g_tokens, clip_g_weights, max_length, padding);
+ }
+ if (t5) {
+ t5_tokenizer.pad_tokens(t5_tokens, t5_weights, nullptr, max_length, padding);
+ }
// for (int i = 0; i < clip_l_tokens.size(); i++) {
// std::cout << clip_l_tokens[i] << ":" << clip_l_weights[i] << ", ";
@@ -780,7 +862,6 @@ struct SD3CLIPEmbedder : public Conditioner {
std::vector<std::pair<std::vector<int>, std::vector<float>>> token_and_weights,
int clip_skip,
bool zero_out_masked = false) {
- set_clip_skip(clip_skip);
auto& clip_l_tokens = token_and_weights[0].first;
auto& clip_l_weights = token_and_weights[0].second;
auto& clip_g_tokens = token_and_weights[1].first;
@@ -788,22 +869,26 @@ struct SD3CLIPEmbedder : public Conditioner {
auto& t5_tokens = token_and_weights[2].first;
auto& t5_weights = token_and_weights[2].second;
+ if (clip_skip <= 0) {
+ clip_skip = 2;
+ }
+
int64_t t0 = ggml_time_ms();
- struct ggml_tensor* hidden_states = NULL; // [N, n_token*2, 4096]
- struct ggml_tensor* chunk_hidden_states = NULL; // [n_token*2, 4096]
- struct ggml_tensor* chunk_hidden_states_l = NULL; // [n_token, hidden_size_l]
- struct ggml_tensor* chunk_hidden_states_g = NULL; // [n_token, hidden_size_g]
- struct ggml_tensor* chunk_hidden_states_t5 = NULL; // [n_token, hidden_size_t5]
- struct ggml_tensor* pooled = NULL;
- struct ggml_tensor* pooled_l = NULL; // [768,]
- struct ggml_tensor* pooled_g = NULL; // [1280,]
+ struct ggml_tensor* hidden_states = nullptr; // [N, n_token*2, 4096]
+ struct ggml_tensor* chunk_hidden_states = nullptr; // [n_token*2, 4096]
+ struct ggml_tensor* chunk_hidden_states_l = nullptr; // [n_token, hidden_size_l]
+ struct ggml_tensor* chunk_hidden_states_g = nullptr; // [n_token, hidden_size_g]
+ struct ggml_tensor* chunk_hidden_states_t5 = nullptr; // [n_token, hidden_size_t5]
+ struct ggml_tensor* pooled = nullptr;
+ struct ggml_tensor* pooled_l = nullptr; // [768,]
+ struct ggml_tensor* pooled_g = nullptr; // [1280,]
std::vector<float> hidden_states_vec;
size_t chunk_len = 77;
- size_t chunk_count = clip_l_tokens.size() / chunk_len;
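+ // chunk by the longest of the three token streams, since any encoder may be absent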
+ size_t chunk_count = std::max(std::max(clip_l_tokens.size(), clip_g_tokens.size()), t5_tokens.size()) / chunk_len;
for (int chunk_idx = 0; chunk_idx < chunk_count; chunk_idx++) {
// clip_l
- {
+ if (clip_l) {
std::vector<int> chunk_tokens(clip_l_tokens.begin() + chunk_idx * chunk_len,
clip_l_tokens.begin() + (chunk_idx + 1) * chunk_len);
std::vector<float> chunk_weights(clip_l_weights.begin() + chunk_idx * chunk_len,
@@ -815,25 +900,26 @@ struct SD3CLIPEmbedder : public Conditioner {
clip_l->compute(n_threads,
input_ids,
0,
- NULL,
+ nullptr,
max_token_idx,
false,
+ clip_skip,
&chunk_hidden_states_l,
work_ctx);
{
auto tensor = chunk_hidden_states_l;
- float original_mean = ggml_tensor_mean(tensor);
+ float original_mean = ggml_ext_tensor_mean(tensor);
for (int i2 = 0; i2 < tensor->ne[2]; i2++) {
for (int i1 = 0; i1 < tensor->ne[1]; i1++) {
for (int i0 = 0; i0 < tensor->ne[0]; i0++) {
- float value = ggml_tensor_get_f32(tensor, i0, i1, i2);
+ float value = ggml_ext_tensor_get_f32(tensor, i0, i1, i2);
value *= chunk_weights[i1];
- ggml_tensor_set_f32(tensor, value, i0, i1, i2);
+ ggml_ext_tensor_set_f32(tensor, value, i0, i1, i2);
}
}
}
- float new_mean = ggml_tensor_mean(tensor);
- ggml_tensor_scale(tensor, (original_mean / new_mean));
+ float new_mean = ggml_ext_tensor_mean(tensor);
+ ggml_ext_tensor_scale_inplace(tensor, (original_mean / new_mean));
}
if (chunk_idx == 0) {
@@ -842,16 +928,24 @@ struct SD3CLIPEmbedder : public Conditioner {
clip_l->compute(n_threads,
input_ids,
0,
- NULL,
+ nullptr,
max_token_idx,
true,
+ clip_skip,
&pooled_l,
work_ctx);
}
+ } else {
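+ // clip_l missing: substitute zero hidden states (768 per token) and a zero pooled vector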
+ chunk_hidden_states_l = ggml_new_tensor_2d(work_ctx, GGML_TYPE_F32, 768, chunk_len);
+ ggml_set_f32(chunk_hidden_states_l, 0.f);
+ if (chunk_idx == 0) {
+ pooled_l = ggml_new_tensor_1d(work_ctx, GGML_TYPE_F32, 768);
+ ggml_set_f32(pooled_l, 0.f);
+ }
}
// clip_g
- {
+ if (clip_g) {
std::vector<int> chunk_tokens(clip_g_tokens.begin() + chunk_idx * chunk_len,
clip_g_tokens.begin() + (chunk_idx + 1) * chunk_len);
std::vector<float> chunk_weights(clip_g_weights.begin() + chunk_idx * chunk_len,
@@ -863,26 +957,27 @@ struct SD3CLIPEmbedder : public Conditioner {
clip_g->compute(n_threads,
input_ids,
0,
- NULL,
+ nullptr,
max_token_idx,
false,
+ clip_skip,
&chunk_hidden_states_g,
work_ctx);
{
auto tensor = chunk_hidden_states_g;
- float original_mean = ggml_tensor_mean(tensor);
+ float original_mean = ggml_ext_tensor_mean(tensor);
for (int i2 = 0; i2 < tensor->ne[2]; i2++) {
for (int i1 = 0; i1 < tensor->ne[1]; i1++) {
for (int i0 = 0; i0 < tensor->ne[0]; i0++) {
- float value = ggml_tensor_get_f32(tensor, i0, i1, i2);
+ float value = ggml_ext_tensor_get_f32(tensor, i0, i1, i2);
value *= chunk_weights[i1];
- ggml_tensor_set_f32(tensor, value, i0, i1, i2);
+ ggml_ext_tensor_set_f32(tensor, value, i0, i1, i2);
}
}
}
- float new_mean = ggml_tensor_mean(tensor);
- ggml_tensor_scale(tensor, (original_mean / new_mean));
+ float new_mean = ggml_ext_tensor_mean(tensor);
+ ggml_ext_tensor_scale_inplace(tensor, (original_mean / new_mean));
}
if (chunk_idx == 0) {
@@ -891,16 +986,24 @@ struct SD3CLIPEmbedder : public Conditioner {
clip_g->compute(n_threads,
input_ids,
0,
- NULL,
+ nullptr,
max_token_idx,
true,
+ clip_skip,
&pooled_g,
work_ctx);
}
+ } else {
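+ // clip_g missing: zero-filled stand-ins keep the l+g concat and pooled shapes valid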
+ chunk_hidden_states_g = ggml_new_tensor_2d(work_ctx, GGML_TYPE_F32, 1280, chunk_len);
+ ggml_set_f32(chunk_hidden_states_g, 0.f);
+ if (chunk_idx == 0) {
+ pooled_g = ggml_new_tensor_1d(work_ctx, GGML_TYPE_F32, 1280);
+ ggml_set_f32(pooled_g, 0.f);
+ }
}
// t5
- {
+ if (t5) {
std::vector<int> chunk_tokens(t5_tokens.begin() + chunk_idx * chunk_len,
t5_tokens.begin() + (chunk_idx + 1) * chunk_len);
std::vector<float> chunk_weights(t5_weights.begin() + chunk_idx * chunk_len,
@@ -910,24 +1013,27 @@ struct SD3CLIPEmbedder : public Conditioner {
t5->compute(n_threads,
input_ids,
- NULL,
+ nullptr,
&chunk_hidden_states_t5,
work_ctx);
{
auto tensor = chunk_hidden_states_t5;
- float original_mean = ggml_tensor_mean(tensor);
+ float original_mean = ggml_ext_tensor_mean(tensor);
for (int i2 = 0; i2 < tensor->ne[2]; i2++) {
for (int i1 = 0; i1 < tensor->ne[1]; i1++) {
for (int i0 = 0; i0 < tensor->ne[0]; i0++) {
- float value = ggml_tensor_get_f32(tensor, i0, i1, i2);
+ float value = ggml_ext_tensor_get_f32(tensor, i0, i1, i2);
value *= chunk_weights[i1];
- ggml_tensor_set_f32(tensor, value, i0, i1, i2);
+ ggml_ext_tensor_set_f32(tensor, value, i0, i1, i2);
}
}
}
- float new_mean = ggml_tensor_mean(tensor);
- ggml_tensor_scale(tensor, (original_mean / new_mean));
+ float new_mean = ggml_ext_tensor_mean(tensor);
+ ggml_ext_tensor_scale_inplace(tensor, (original_mean / new_mean));
}
+ } else {
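+ // t5 missing: zero hidden states at t5's 4096-wide embedding size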
+ chunk_hidden_states_t5 = ggml_new_tensor_2d(work_ctx, GGML_TYPE_F32, 4096, chunk_len);
+ ggml_set_f32(chunk_hidden_states_t5, 0.f);
}
auto chunk_hidden_states_lg_pad = ggml_new_tensor_3d(work_ctx,
@@ -941,19 +1047,19 @@ struct SD3CLIPEmbedder : public Conditioner {
for (int i0 = 0; i0 < chunk_hidden_states_lg_pad->ne[0]; i0++) {
float value = 0.f;
if (i0 < chunk_hidden_states_l->ne[0]) {
- value = ggml_tensor_get_f32(chunk_hidden_states_l, i0, i1, i2);
+ value = ggml_ext_tensor_get_f32(chunk_hidden_states_l, i0, i1, i2);
} else if (i0 < chunk_hidden_states_l->ne[0] + chunk_hidden_states_g->ne[0]) {
- value = ggml_tensor_get_f32(chunk_hidden_states_g, i0 - chunk_hidden_states_l->ne[0], i1, i2);
+ value = ggml_ext_tensor_get_f32(chunk_hidden_states_g, i0 - chunk_hidden_states_l->ne[0], i1, i2);
}
- ggml_tensor_set_f32(chunk_hidden_states_lg_pad, value, i0, i1, i2);
+ ggml_ext_tensor_set_f32(chunk_hidden_states_lg_pad, value, i0, i1, i2);
}
}
}
- chunk_hidden_states = ggml_tensor_concat(work_ctx, chunk_hidden_states_lg_pad, chunk_hidden_states_t5, 1); // [n_token*2, 4096]
+ chunk_hidden_states = ggml_ext_tensor_concat(work_ctx, chunk_hidden_states_lg_pad, chunk_hidden_states_t5, 1); // [n_token*2, 4096]
if (chunk_idx == 0) {
- pooled = ggml_tensor_concat(work_ctx, pooled_l, pooled_g, 0); // [768 + 1280]
+ pooled = ggml_ext_tensor_concat(work_ctx, pooled_l, pooled_g, 0); // [768 + 1280]
}
int64_t t1 = ggml_time_ms();
@@ -970,41 +1076,32 @@ struct SD3CLIPEmbedder : public Conditioner {
((float*)chunk_hidden_states->data) + ggml_nelements(chunk_hidden_states));
}
- hidden_states = vector_to_ggml_tensor(work_ctx, hidden_states_vec);
- hidden_states = ggml_reshape_2d(work_ctx,
- hidden_states,
- chunk_hidden_states->ne[0],
- ggml_nelements(hidden_states) / chunk_hidden_states->ne[0]);
- return SDCondition(hidden_states, pooled, NULL);
+ if (hidden_states_vec.size() > 0) {
+ hidden_states = vector_to_ggml_tensor(work_ctx, hidden_states_vec);
+ hidden_states = ggml_reshape_2d(work_ctx,
+ hidden_states,
+ chunk_hidden_states->ne[0],
+ ggml_nelements(hidden_states) / chunk_hidden_states->ne[0]);
+ } else {
+ hidden_states = ggml_new_tensor_2d(work_ctx, GGML_TYPE_F32, 4096, 256);
+ ggml_set_f32(hidden_states, 0.f);
+ }
+ if (pooled == nullptr) {
+ pooled = ggml_new_tensor_1d(work_ctx, GGML_TYPE_F32, 2048);
+ ggml_set_f32(pooled, 0.f);
+ }
+ return {hidden_states, pooled, nullptr};
}
SDCondition get_learned_condition(ggml_context* work_ctx,
int n_threads,
- const std::string& text,
- int clip_skip,
- int width,
- int height,
- int adm_in_channels = -1,
- bool zero_out_masked = false) {
- auto tokens_and_weights = tokenize(text, 77, true);
- return get_learned_condition_common(work_ctx, n_threads, tokens_and_weights, clip_skip, zero_out_masked);
- }
-
- std::tuple<SDCondition, std::vector<bool>> get_learned_condition_with_trigger(ggml_context* work_ctx,
- int n_threads,
- const std::string& text,
- int clip_skip,
- int width,
- int height,
- int num_input_imgs,
- int adm_in_channels = -1,
- bool zero_out_masked = false) {
- GGML_ASSERT(0 && "Not implemented yet!");
- }
-
- std::string remove_trigger_from_prompt(ggml_context* work_ctx,
- const std::string& prompt) {
- GGML_ASSERT(0 && "Not implemented yet!");
+ const ConditionerParams& conditioner_params) override {
+ auto tokens_and_weights = tokenize(conditioner_params.text, 77, true);
+ return get_learned_condition_common(work_ctx,
+ n_threads,
+ tokens_and_weights,
+ conditioner_params.clip_skip,
+ conditioner_params.zero_out_masked);
}
};
@@ -1017,41 +1114,81 @@ struct FluxCLIPEmbedder : public Conditioner {
FluxCLIPEmbedder(ggml_backend_t backend,
bool offload_params_to_cpu,
- const String2GGMLType& tensor_types = {},
- int clip_skip = -1) {
- clip_l = std::make_shared<CLIPTextModelRunner>(backend, offload_params_to_cpu, tensor_types, "text_encoders.clip_l.transformer.text_model", OPENAI_CLIP_VIT_L_14, true);
- t5 = std::make_shared<T5Runner>(backend, offload_params_to_cpu, tensor_types, "text_encoders.t5xxl.transformer");
- set_clip_skip(clip_skip);
- }
+ const String2TensorStorage& tensor_storage_map = {}) {
+ bool use_clip_l = false;
+ bool use_t5 = false;
+ for (auto pair : tensor_storage_map) {
+ if (pair.first.find("text_encoders.clip_l") != std::string::npos) {
+ use_clip_l = true;
+ } else if (pair.first.find("text_encoders.t5xxl") != std::string::npos) {
+ use_t5 = true;
+ }
+ }
- void set_clip_skip(int clip_skip) {
- if (clip_skip <= 0) {
- clip_skip = 2;
+ if (!use_clip_l && !use_t5) {
+ LOG_WARN("IMPORTANT NOTICE: No text encoders provided, cannot process prompts!");
+ return;
+ }
+
+ if (use_clip_l) {
+ clip_l = std::make_shared<CLIPTextModelRunner>(backend, offload_params_to_cpu, tensor_storage_map, "text_encoders.clip_l.transformer.text_model", OPENAI_CLIP_VIT_L_14, true);
+ } else {
+ LOG_WARN("clip_l text encoder not found! Prompt adherence might be degraded.");
+ }
+ if (use_t5) {
+ t5 = std::make_shared<T5Runner>(backend, offload_params_to_cpu, tensor_storage_map, "text_encoders.t5xxl.transformer");
+ } else {
+ LOG_WARN("t5xxl text encoder not found! Prompt adherence might be degraded.");
}
- clip_l->set_clip_skip(clip_skip);
}
- void get_param_tensors(std::map<std::string, struct ggml_tensor*>& tensors) {
- clip_l->get_param_tensors(tensors, "text_encoders.clip_l.transformer.text_model");
- t5->get_param_tensors(tensors, "text_encoders.t5xxl.transformer");
+ void get_param_tensors(std::map<std::string, struct ggml_tensor*>& tensors) override {
+ if (clip_l) {
+ clip_l->get_param_tensors(tensors, "text_encoders.clip_l.transformer.text_model");
+ }
+ if (t5) {
+ t5->get_param_tensors(tensors, "text_encoders.t5xxl.transformer");
+ }
}
- void alloc_params_buffer() {
- clip_l->alloc_params_buffer();
- t5->alloc_params_buffer();
+ void alloc_params_buffer() override {
+ if (clip_l) {
+ clip_l->alloc_params_buffer();
+ }
+ if (t5) {
+ t5->alloc_params_buffer();
+ }
}
- void free_params_buffer() {
- clip_l->free_params_buffer();
- t5->free_params_buffer();
+ void free_params_buffer() override {
+ if (clip_l) {
+ clip_l->free_params_buffer();
+ }
+ if (t5) {
+ t5->free_params_buffer();
+ }
}
- size_t get_params_buffer_size() {
- size_t buffer_size = clip_l->get_params_buffer_size();
- buffer_size += t5->get_params_buffer_size();
+ size_t get_params_buffer_size() override {
+ size_t buffer_size = 0;
+ if (clip_l) {
+ buffer_size += clip_l->get_params_buffer_size();
+ }
+ if (t5) {
+ buffer_size += t5->get_params_buffer_size();
+ }
return buffer_size;
}
+ void set_weight_adapter(const std::shared_ptr<WeightAdapter>& adapter) {
+ if (clip_l) {
+ clip_l->set_weight_adapter(adapter);
+ }
+ if (t5) {
+ t5->set_weight_adapter(adapter);
+ }
+ }
+
std::vector<std::pair<std::vector<int>, std::vector<float>>> tokenize(std::string text,
size_t max_length = 0,
bool padding = false) {
@@ -1078,18 +1215,24 @@ struct FluxCLIPEmbedder : public Conditioner {
for (const auto& item : parsed_attention) {
const std::string& curr_text = item.first;
float curr_weight = item.second;
-
- std::vector<int> curr_tokens = clip_l_tokenizer.encode(curr_text, on_new_token_cb);
- clip_l_tokens.insert(clip_l_tokens.end(), curr_tokens.begin(), curr_tokens.end());
- clip_l_weights.insert(clip_l_weights.end(), curr_tokens.size(), curr_weight);
-
- curr_tokens = t5_tokenizer.Encode(curr_text, true);
- t5_tokens.insert(t5_tokens.end(), curr_tokens.begin(), curr_tokens.end());
- t5_weights.insert(t5_weights.end(), curr_tokens.size(), curr_weight);
+ if (clip_l) {
+ std::vector<int> curr_tokens = clip_l_tokenizer.encode(curr_text, on_new_token_cb);
+ clip_l_tokens.insert(clip_l_tokens.end(), curr_tokens.begin(), curr_tokens.end());
+ clip_l_weights.insert(clip_l_weights.end(), curr_tokens.size(), curr_weight);
+ }
+ if (t5) {
+ std::vector<int> curr_tokens = t5_tokenizer.Encode(curr_text, true);
+ t5_tokens.insert(t5_tokens.end(), curr_tokens.begin(), curr_tokens.end());
+ t5_weights.insert(t5_weights.end(), curr_tokens.size(), curr_weight);
+ }
}
- clip_l_tokenizer.pad_tokens(clip_l_tokens, clip_l_weights, 77, padding);
- t5_tokenizer.pad_tokens(t5_tokens, t5_weights, NULL, max_length, padding);
+ if (clip_l) {
+ clip_l_tokenizer.pad_tokens(clip_l_tokens, clip_l_weights, 77, padding);
+ }
+ if (t5) {
+ t5_tokenizer.pad_tokens(t5_tokens, t5_weights, nullptr, max_length, padding);
+ }
// for (int i = 0; i < clip_l_tokens.size(); i++) {
// std::cout << clip_l_tokens[i] << ":" << clip_l_weights[i] << ", ";
@@ -1109,46 +1252,52 @@ struct FluxCLIPEmbedder : public Conditioner {
std::vector<std::pair<std::vector<int>, std::vector<float>>> token_and_weights,
int clip_skip,
bool zero_out_masked = false) {
- set_clip_skip(clip_skip);
auto& clip_l_tokens = token_and_weights[0].first;
auto& clip_l_weights = token_and_weights[0].second;
auto& t5_tokens = token_and_weights[1].first;
auto& t5_weights = token_and_weights[1].second;
+ if (clip_skip <= 0) {
+ clip_skip = 2;
+ }
+
int64_t t0 = ggml_time_ms();
- struct ggml_tensor* hidden_states = NULL; // [N, n_token, 4096]
- struct ggml_tensor* chunk_hidden_states = NULL; // [n_token, 4096]
- struct ggml_tensor* pooled = NULL; // [768,]
+ struct ggml_tensor* hidden_states = nullptr; // [N, n_token, 4096]
+ struct ggml_tensor* chunk_hidden_states = nullptr; // [n_token, 4096]
+ struct ggml_tensor* pooled = nullptr; // [768,]
std::vector<float> hidden_states_vec;
- size_t chunk_count = t5_tokens.size() / chunk_len;
+ size_t chunk_count = std::max(clip_l_tokens.size() > 0 ? chunk_len : 0, t5_tokens.size()) / chunk_len;
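// Long prompts are encoded in fixed-size chunks of chunk_len tokens, and the
// per-chunk hidden states are concatenated along the token axis below. E.g.
// with chunk_len = 256, a 512-token t5 prompt gives chunk_count = 2 and a
// final hidden_states of shape [512, 4096].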
for (int chunk_idx = 0; chunk_idx < chunk_count; chunk_idx++) {
// clip_l
if (chunk_idx == 0) {
- size_t chunk_len_l = 77;
- std::vector<int> chunk_tokens(clip_l_tokens.begin(),
- clip_l_tokens.begin() + chunk_len_l);
- std::vector<float> chunk_weights(clip_l_weights.begin(),
- clip_l_weights.begin() + chunk_len_l);
+ if (clip_l) {
+ size_t chunk_len_l = 77;
+ std::vector<int> chunk_tokens(clip_l_tokens.begin(),
+ clip_l_tokens.begin() + chunk_len_l);
+ std::vector<float> chunk_weights(clip_l_weights.begin(),
+ clip_l_weights.begin() + chunk_len_l);
- auto input_ids = vector_to_ggml_tensor_i32(work_ctx, chunk_tokens);
- size_t max_token_idx = 0;
+ auto input_ids = vector_to_ggml_tensor_i32(work_ctx, chunk_tokens);
+ size_t max_token_idx = 0;
- auto it = std::find(chunk_tokens.begin(), chunk_tokens.end(), clip_l_tokenizer.EOS_TOKEN_ID);
- max_token_idx = std::min<size_t>(std::distance(chunk_tokens.begin(), it), chunk_tokens.size() - 1);
+ auto it = std::find(chunk_tokens.begin(), chunk_tokens.end(), clip_l_tokenizer.EOS_TOKEN_ID);
+ max_token_idx = std::min<size_t>(std::distance(chunk_tokens.begin(), it), chunk_tokens.size() - 1);
- clip_l->compute(n_threads,
- input_ids,
- 0,
- NULL,
- max_token_idx,
- true,
- &pooled,
- work_ctx);
+ clip_l->compute(n_threads,
+ input_ids,
+ 0,
+ nullptr,
+ max_token_idx,
+ true,
+ clip_skip,
+ &pooled,
+ work_ctx);
+ }
}
// t5
- {
+ if (t5) {
std::vector<int> chunk_tokens(t5_tokens.begin() + chunk_idx * chunk_len,
t5_tokens.begin() + (chunk_idx + 1) * chunk_len);
std::vector<float> chunk_weights(t5_weights.begin() + chunk_idx * chunk_len,
@@ -1158,24 +1307,27 @@ struct FluxCLIPEmbedder : public Conditioner {
t5->compute(n_threads,
input_ids,
- NULL,
+ nullptr,
&chunk_hidden_states,
work_ctx);
{
auto tensor = chunk_hidden_states;
- float original_mean = ggml_tensor_mean(tensor);
+ float original_mean = ggml_ext_tensor_mean(tensor);
for (int i2 = 0; i2 < tensor->ne[2]; i2++) {
for (int i1 = 0; i1 < tensor->ne[1]; i1++) {
for (int i0 = 0; i0 < tensor->ne[0]; i0++) {
- float value = ggml_tensor_get_f32(tensor, i0, i1, i2);
+ float value = ggml_ext_tensor_get_f32(tensor, i0, i1, i2);
value *= chunk_weights[i1];
- ggml_tensor_set_f32(tensor, value, i0, i1, i2);
+ ggml_ext_tensor_set_f32(tensor, value, i0, i1, i2);
}
}
}
- float new_mean = ggml_tensor_mean(tensor);
- ggml_tensor_scale(tensor, (original_mean / new_mean));
+ float new_mean = ggml_ext_tensor_mean(tensor);
+ ggml_ext_tensor_scale_inplace(tensor, (original_mean / new_mean));
}
+ } else {
+ chunk_hidden_states = ggml_new_tensor_2d(work_ctx, GGML_TYPE_F32, 4096, chunk_len);
+ ggml_set_f32(chunk_hidden_states, 0.f);
}
int64_t t1 = ggml_time_ms();
@@ -1192,41 +1344,32 @@ struct FluxCLIPEmbedder : public Conditioner {
((float*)chunk_hidden_states->data) + ggml_nelements(chunk_hidden_states));
}
- hidden_states = vector_to_ggml_tensor(work_ctx, hidden_states_vec);
- hidden_states = ggml_reshape_2d(work_ctx,
- hidden_states,
- chunk_hidden_states->ne[0],
- ggml_nelements(hidden_states) / chunk_hidden_states->ne[0]);
- return SDCondition(hidden_states, pooled, NULL);
+ if (hidden_states_vec.size() > 0) {
+ hidden_states = vector_to_ggml_tensor(work_ctx, hidden_states_vec);
+ hidden_states = ggml_reshape_2d(work_ctx,
+ hidden_states,
+ chunk_hidden_states->ne[0],
+ ggml_nelements(hidden_states) / chunk_hidden_states->ne[0]);
+ } else {
+ hidden_states = ggml_new_tensor_2d(work_ctx, GGML_TYPE_F32, 4096, 256);
+ ggml_set_f32(hidden_states, 0.f);
+ }
+ if (pooled == nullptr) {
+ pooled = ggml_new_tensor_1d(work_ctx, GGML_TYPE_F32, 768);
+ ggml_set_f32(pooled, 0.f);
+ }
+ return {hidden_states, pooled, nullptr};
}
SDCondition get_learned_condition(ggml_context* work_ctx,
int n_threads,
- const std::string& text,
- int clip_skip,
- int width,
- int height,
- int adm_in_channels = -1,
- bool zero_out_masked = false) {
- auto tokens_and_weights = tokenize(text, chunk_len, true);
- return get_learned_condition_common(work_ctx, n_threads, tokens_and_weights, clip_skip, zero_out_masked);
- }
-
- std::tuple<SDCondition, std::vector<bool>> get_learned_condition_with_trigger(ggml_context* work_ctx,
- int n_threads,
- const std::string& text,
- int clip_skip,
- int width,
- int height,
- int num_input_imgs,
- int adm_in_channels = -1,
- bool zero_out_masked = false) {
- GGML_ASSERT(0 && "Not implemented yet!");
- }
-
- std::string remove_trigger_from_prompt(ggml_context* work_ctx,
- const std::string& prompt) {
- GGML_ASSERT(0 && "Not implemented yet!");
+ const ConditionerParams& conditioner_params) override {
+ auto tokens_and_weights = tokenize(conditioner_params.text, chunk_len, true);
+ return get_learned_condition_common(work_ctx,
+ n_threads,
+ tokens_and_weights,
+ conditioner_params.clip_skip,
+ conditioner_params.zero_out_masked);
}
};
@@ -1240,38 +1383,58 @@ struct T5CLIPEmbedder : public Conditioner {
T5CLIPEmbedder(ggml_backend_t backend,
bool offload_params_to_cpu,
- const String2GGMLType& tensor_types = {},
- int clip_skip = -1,
- bool use_mask = false,
- int mask_pad = 1,
- bool is_umt5 = false)
+ const String2TensorStorage& tensor_storage_map = {},
+ bool use_mask = false,
+ int mask_pad = 1,
+ bool is_umt5 = false)
: use_mask(use_mask), mask_pad(mask_pad), t5_tokenizer(is_umt5) {
- t5 = std::make_shared<T5Runner>(backend, offload_params_to_cpu, tensor_types, "text_encoders.t5xxl.transformer", is_umt5);
- }
+ bool use_t5 = false;
+ for (auto pair : tensor_storage_map) {
+ if (pair.first.find("text_encoders.t5xxl") != std::string::npos) {
+ use_t5 = true;
+ }
+ }
- void set_clip_skip(int clip_skip) {
+ if (!use_t5) {
+ LOG_WARN("IMPORTANT NOTICE: No text encoders provided, cannot process prompts!");
+ return;
+ } else {
+ t5 = std::make_shared<T5Runner>(backend, offload_params_to_cpu, tensor_storage_map, "text_encoders.t5xxl.transformer", is_umt5);
+ }
}
- void get_param_tensors(std::map<std::string, struct ggml_tensor*>& tensors) {
- t5->get_param_tensors(tensors, "text_encoders.t5xxl.transformer");
+ void get_param_tensors(std::map<std::string, struct ggml_tensor*>& tensors) override {
+ if (t5) {
+ t5->get_param_tensors(tensors, "text_encoders.t5xxl.transformer");
+ }
}
- void alloc_params_buffer() {
- t5->alloc_params_buffer();
+ void alloc_params_buffer() override {
+ if (t5) {
+ t5->alloc_params_buffer();
+ }
}
- void free_params_buffer() {
- t5->free_params_buffer();
+ void free_params_buffer() override {
+ if (t5) {
+ t5->free_params_buffer();
+ }
}
- size_t get_params_buffer_size() {
+ size_t get_params_buffer_size() override {
size_t buffer_size = 0;
-
- buffer_size += t5->get_params_buffer_size();
-
+ if (t5) {
+ buffer_size += t5->get_params_buffer_size();
+ }
return buffer_size;
}
+ void set_weight_adapter(const std::shared_ptr<WeightAdapter>& adapter) override {
+ if (t5) {
+ t5->set_weight_adapter(adapter);
+ }
+ }
+
std::tuple<std::vector<int>, std::vector<float>, std::vector<float>> tokenize(std::string text,
size_t max_length = 0,
bool padding = false) {
@@ -1294,17 +1457,18 @@ struct T5CLIPEmbedder : public Conditioner {
std::vector<int> t5_tokens;
std::vector<float> t5_weights;
std::vector<float> t5_mask;
- for (const auto& item : parsed_attention) {
- const std::string& curr_text = item.first;
- float curr_weight = item.second;
-
- std::vector<int> curr_tokens = t5_tokenizer.Encode(curr_text, true);
- t5_tokens.insert(t5_tokens.end(), curr_tokens.begin(), curr_tokens.end());
- t5_weights.insert(t5_weights.end(), curr_tokens.size(), curr_weight);
- }
+ if (t5) {
+ for (const auto& item : parsed_attention) {
+ const std::string& curr_text = item.first;
+ float curr_weight = item.second;
- t5_tokenizer.pad_tokens(t5_tokens, t5_weights, &t5_mask, max_length, padding);
+ std::vector<int> curr_tokens = t5_tokenizer.Encode(curr_text, true);
+ t5_tokens.insert(t5_tokens.end(), curr_tokens.begin(), curr_tokens.end());
+ t5_weights.insert(t5_weights.end(), curr_tokens.size(), curr_weight);
+ }
+ t5_tokenizer.pad_tokens(t5_tokens, t5_weights, &t5_mask, max_length, padding);
+ }
return {t5_tokens, t5_weights, t5_mask};
}
@@ -1328,14 +1492,21 @@ struct T5CLIPEmbedder : public Conditioner {
std::tuple<std::vector<int>, std::vector<float>, std::vector<float>> token_and_weights,
int clip_skip,
bool zero_out_masked = false) {
+ if (!t5) {
+ auto hidden_states = ggml_new_tensor_2d(work_ctx, GGML_TYPE_F32, 4096, 256);
+ ggml_set_f32(hidden_states, 0.f);
+ auto t5_attn_mask = ggml_new_tensor_1d(work_ctx, GGML_TYPE_F32, 256);
+ ggml_set_f32(t5_attn_mask, -HUGE_VALF);
+ return {hidden_states, t5_attn_mask, nullptr};
+ }
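// The fallback mask above is filled with -HUGE_VALF because the t5 attention
// mask is additive: it is added to the attention logits before softmax, so
// -inf entries give masked positions effectively zero attention weight. Roughly:
//   score[i][j] += mask[j];   // mask[j] is 0 for real tokens, -inf for masked
//   attn[i] = softmax(score[i]);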
auto& t5_tokens = std::get<0>(token_and_weights);
auto& t5_weights = std::get<1>(token_and_weights);
auto& t5_attn_mask_vec = std::get<2>(token_and_weights);
int64_t t0 = ggml_time_ms();
- struct ggml_tensor* hidden_states = NULL; // [N, n_token, 4096]
- struct ggml_tensor* chunk_hidden_states = NULL; // [n_token, 4096]
- struct ggml_tensor* pooled = NULL;
+ struct ggml_tensor* hidden_states = nullptr; // [N, n_token, 4096]
+ struct ggml_tensor* chunk_hidden_states = nullptr; // [n_token, 4096]
+ struct ggml_tensor* pooled = nullptr;
struct ggml_tensor* t5_attn_mask = vector_to_ggml_tensor(work_ctx, t5_attn_mask_vec); // [n_token]
std::vector<float> hidden_states_vec;
@@ -1352,7 +1523,7 @@ struct T5CLIPEmbedder : public Conditioner {
t5_attn_mask_vec.begin() + (chunk_idx + 1) * chunk_len);
auto input_ids = vector_to_ggml_tensor_i32(work_ctx, chunk_tokens);
- auto t5_attn_mask_chunk = use_mask ? vector_to_ggml_tensor(work_ctx, chunk_mask) : NULL;
+ auto t5_attn_mask_chunk = use_mask ? vector_to_ggml_tensor(work_ctx, chunk_mask) : nullptr;
t5->compute(n_threads,
input_ids,
@@ -1361,18 +1532,18 @@ struct T5CLIPEmbedder : public Conditioner {
work_ctx);
{
auto tensor = chunk_hidden_states;
- float original_mean = ggml_tensor_mean(tensor);
+ float original_mean = ggml_ext_tensor_mean(tensor);
for (int i2 = 0; i2 < tensor->ne[2]; i2++) {
for (int i1 = 0; i1 < tensor->ne[1]; i1++) {
for (int i0 = 0; i0 < tensor->ne[0]; i0++) {
- float value = ggml_tensor_get_f32(tensor, i0, i1, i2);
+ float value = ggml_ext_tensor_get_f32(tensor, i0, i1, i2);
value *= chunk_weights[i1];
- ggml_tensor_set_f32(tensor, value, i0, i1, i2);
+ ggml_ext_tensor_set_f32(tensor, value, i0, i1, i2);
}
}
}
- float new_mean = ggml_tensor_mean(tensor);
- ggml_tensor_scale(tensor, (original_mean / new_mean));
+ float new_mean = ggml_ext_tensor_mean(tensor);
+ ggml_ext_tensor_scale_inplace(tensor, (original_mean / new_mean));
}
int64_t t1 = ggml_time_ms();
@@ -1383,7 +1554,7 @@ struct T5CLIPEmbedder : public Conditioner {
for (int i1 = 0; i1 < tensor->ne[1]; i1++) {
for (int i0 = 0; i0 < tensor->ne[0]; i0++) {
if (chunk_mask[i1] < 0.f) {
- ggml_tensor_set_f32(tensor, 0.f, i0, i1, i2);
+ ggml_ext_tensor_set_f32(tensor, 0.f, i0, i1, i2);
}
}
}
@@ -1404,36 +1575,225 @@ struct T5CLIPEmbedder : public Conditioner {
modify_mask_to_attend_padding(t5_attn_mask, ggml_nelements(t5_attn_mask), mask_pad);
- return SDCondition(hidden_states, t5_attn_mask, NULL);
+ return {hidden_states, t5_attn_mask, nullptr};
}
SDCondition get_learned_condition(ggml_context* work_ctx,
int n_threads,
- const std::string& text,
- int clip_skip,
- int width,
- int height,
- int adm_in_channels = -1,
- bool zero_out_masked = false) {
- auto tokens_and_weights = tokenize(text, chunk_len, true);
- return get_learned_condition_common(work_ctx, n_threads, tokens_and_weights, clip_skip, zero_out_masked);
+ const ConditionerParams& conditioner_params) override {
+ auto tokens_and_weights = tokenize(conditioner_params.text, chunk_len, true);
+ return get_learned_condition_common(work_ctx,
+ n_threads,
+ tokens_and_weights,
+ conditioner_params.clip_skip,
+ conditioner_params.zero_out_masked);
}
+};
- std::tuple<SDCondition, std::vector<bool>> get_learned_condition_with_trigger(ggml_context* work_ctx,
- int n_threads,
- const std::string& text,
- int clip_skip,
- int width,
- int height,
- int num_input_imgs,
- int adm_in_channels = -1,
- bool zero_out_masked = false) {
- GGML_ASSERT(0 && "Not implemented yet!");
+struct Qwen2_5_VLCLIPEmbedder : public Conditioner {
+ Qwen::Qwen2Tokenizer tokenizer;
+ std::shared_ptr<Qwen::Qwen2_5_VLRunner> qwenvl;
+
+ Qwen2_5_VLCLIPEmbedder(ggml_backend_t backend,
+ bool offload_params_to_cpu,
+ const String2TensorStorage& tensor_storage_map = {},
+ const std::string prefix = "",
+ bool enable_vision = false) {
+ qwenvl = std::make_shared<Qwen::Qwen2_5_VLRunner>(backend,
+ offload_params_to_cpu,
+ tensor_storage_map,
+ "text_encoders.qwen2vl",
+ enable_vision);
}
- std::string remove_trigger_from_prompt(ggml_context* work_ctx,
- const std::string& prompt) {
- GGML_ASSERT(0 && "Not implemented yet!");
+ void get_param_tensors(std::map<std::string, struct ggml_tensor*>& tensors) override {
+ qwenvl->get_param_tensors(tensors, "text_encoders.qwen2vl");
+ }
+
+ void alloc_params_buffer() override {
+ qwenvl->alloc_params_buffer();
+ }
+
+ void free_params_buffer() override {
+ qwenvl->free_params_buffer();
+ }
+
+ size_t get_params_buffer_size() override {
+ size_t buffer_size = 0;
+ buffer_size += qwenvl->get_params_buffer_size();
+ return buffer_size;
+ }
+
+ void set_weight_adapter(const std::shared_ptr<WeightAdapter>& adapter) override {
+ if (qwenvl) {
+ qwenvl->set_weight_adapter(adapter);
+ }
+ }
+
+ std::tuple<std::vector<int>, std::vector<float>> tokenize(std::string text,
+ size_t max_length = 0,
+ size_t system_prompt_length = 0,
+ bool padding = false) {
+ std::vector<std::pair<std::string, float>> parsed_attention;
+ if (system_prompt_length > 0) {
+ parsed_attention.emplace_back(text.substr(0, system_prompt_length), 1.f);
+ auto new_parsed_attention = parse_prompt_attention(text.substr(system_prompt_length, text.size() - system_prompt_length));
+ parsed_attention.insert(parsed_attention.end(),
+ new_parsed_attention.begin(),
+ new_parsed_attention.end());
+ } else {
+ parsed_attention = parse_prompt_attention(text);
+ }
+
+ {
+ std::stringstream ss;
+ ss << "[";
+ for (const auto& item : parsed_attention) {
+ ss << "['" << item.first << "', " << item.second << "], ";
+ }
+ ss << "]";
+ LOG_DEBUG("parse '%s' to %s", text.c_str(), ss.str().c_str());
+ }
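// For reference, parse_prompt_attention uses the usual "(text:weight)" prompt
// syntax, so a prompt like "a (red:1.3) apple" parses to
// [['a ', 1], ['red', 1.3], [' apple', 1]] (illustrative values).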
+
+ std::vector<int> tokens;
+ std::vector<float> weights;
+ for (const auto& item : parsed_attention) {
+ const std::string& curr_text = item.first;
+ float curr_weight = item.second;
+ std::vector<int> curr_tokens = tokenizer.tokenize(curr_text, nullptr);
+ tokens.insert(tokens.end(), curr_tokens.begin(), curr_tokens.end());
+ weights.insert(weights.end(), curr_tokens.size(), curr_weight);
+ }
+
+ tokenizer.pad_tokens(tokens, weights, max_length, padding);
+
+ // for (int i = 0; i < tokens.size(); i++) {
+ // std::cout << tokens[i] << ":" << weights[i] << ", " << i << std::endl;
+ // }
+ // std::cout << std::endl;
+
+ return {tokens, weights};
+ }
+
+ SDCondition get_learned_condition(ggml_context* work_ctx,
+ int n_threads,
+ const ConditionerParams& conditioner_params) override {
+ std::string prompt;
+ std::vector<std::pair<int, ggml_tensor*>> image_embeds;
+ size_t system_prompt_length = 0;
+ int prompt_template_encode_start_idx = 34;
+ if (qwenvl->enable_vision && conditioner_params.ref_images.size() > 0) {
+ LOG_INFO("QwenImageEditPlusPipeline");
+ prompt_template_encode_start_idx = 64;
+ int image_embed_idx = 64 + 6;
+
+ int min_pixels = 384 * 384;
+ int max_pixels = 560 * 560;
+ std::string placeholder = "<|image_pad|>";
+ std::string img_prompt;
+
+ for (int i = 0; i < conditioner_params.ref_images.size(); i++) {
+ sd_image_f32_t image = sd_image_t_to_sd_image_f32_t(*conditioner_params.ref_images[i]);
+ double factor = qwenvl->params.vision.patch_size * qwenvl->params.vision.spatial_merge_size;
+ int height = image.height;
+ int width = image.width;
+ int h_bar = static_cast<int>(std::round(height / factor)) * factor;
+ int w_bar = static_cast<int>(std::round(width / factor)) * factor;
+
+ if (static_cast<int64_t>(h_bar) * w_bar > max_pixels) {
+ double beta = std::sqrt((height * width) / static_cast<double>(max_pixels));
+ h_bar = std::max(static_cast<int>(factor),
+ static_cast<int>(std::floor(height / beta / factor)) * static_cast<int>(factor));
+ w_bar = std::max(static_cast<int>(factor),
+ static_cast<int>(std::floor(width / beta / factor)) * static_cast<int>(factor));
+ } else if (static_cast<int64_t>(h_bar) * w_bar < min_pixels) {
+ double beta = std::sqrt(static_cast<double>(min_pixels) / (height * width));
+ h_bar = static_cast<int>(std::ceil(height * beta / factor)) * static_cast<int>(factor);
+ w_bar = static_cast<int>(std::ceil(width * beta / factor)) * static_cast<int>(factor);
+ }
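// The rounding above snaps both sides to multiples of
// factor = patch_size * spatial_merge_size while keeping the pixel count in
// [min_pixels, max_pixels]. Worked example (assuming the usual Qwen2.5-VL
// values patch_size = 14, spatial_merge_size = 2, so factor = 28): a 1024x768
// image has 786432 px > 560*560 = 313600, beta = sqrt(786432 / 313600) ~ 1.58,
// h_bar = floor(768 / 1.58 / 28) * 28 = 476, w_bar = floor(1024 / 1.58 / 28) * 28 = 644,
// and 476 * 644 = 306544 <= 313600.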
+
+ LOG_DEBUG("resize conditioner ref image %d from %dx%d to %dx%d", i, image.height, image.width, h_bar, w_bar);
+
+ sd_image_f32_t resized_image = clip_preprocess(image, w_bar, h_bar);
+ free(image.data);
+ image.data = nullptr;
+
+ ggml_tensor* image_tensor = ggml_new_tensor_4d(work_ctx, GGML_TYPE_F32, resized_image.width, resized_image.height, 3, 1);
+ sd_image_f32_to_ggml_tensor(resized_image, image_tensor, false);
+ free(resized_image.data);
+ resized_image.data = nullptr;
+
+ ggml_tensor* image_embed = nullptr;
+ qwenvl->encode_image(n_threads, image_tensor, &image_embed, work_ctx);
+ image_embeds.emplace_back(image_embed_idx, image_embed);
+ image_embed_idx += 1 + image_embed->ne[1] + 6;
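// Position bookkeeping: image_embed_idx starts at the first <|image_pad|>
// slot (64 template-prefix tokens + 6 "Picture N: <|vision_start|>" tokens);
// after each image it advances past that image's ne[1] pad tokens, the single
// <|vision_end|> token, and the next image's 6 header tokens.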
+
+ img_prompt += "Picture " + std::to_string(i + 1) + ": <|vision_start|>"; // [24669, 220, index, 25, 220, 151652]
+ int64_t num_image_tokens = image_embed->ne[1];
+ img_prompt.reserve(num_image_tokens * placeholder.size());
+ for (int j = 0; j < num_image_tokens; j++) {
+ img_prompt += placeholder;
+ }
+ img_prompt += "<|vision_end|>";
+ }
+
+ prompt = "<|im_start|>system\nDescribe the key features of the input image (color, shape, size, texture, objects, background), then explain how the user's text instruction should alter or modify the image. Generate a new image that meets the user's requirements while maintaining consistency with the original input where appropriate.<|im_end|>\n<|im_start|>user\n";
+
+ system_prompt_length = prompt.size();
+
+ prompt += img_prompt;
+ prompt += conditioner_params.text;
+ prompt += "<|im_end|>\n<|im_start|>assistant\n";
+ } else {
+ prompt = "<|im_start|>system\nDescribe the image by detailing the color, shape, size, texture, quantity, text, spatial relationships of the objects and background:<|im_end|>\n<|im_start|>user\n" + conditioner_params.text + "<|im_end|>\n<|im_start|>assistant\n";
+ }
+
+ auto tokens_and_weights = tokenize(prompt, 0, system_prompt_length, false);
+ auto& tokens = std::get<0>(tokens_and_weights);
+ auto& weights = std::get<1>(tokens_and_weights);
+
+ int64_t t0 = ggml_time_ms();
+ struct ggml_tensor* hidden_states = nullptr; // [N, n_token, 3584]
+
+ auto input_ids = vector_to_ggml_tensor_i32(work_ctx, tokens);
+
+ qwenvl->compute(n_threads,
+ input_ids,
+ image_embeds,
+ &hidden_states,
+ work_ctx);
+ {
+ auto tensor = hidden_states;
+ float original_mean = ggml_ext_tensor_mean(tensor);
+ for (int i2 = 0; i2 < tensor->ne[2]; i2++) {
+ for (int i1 = 0; i1 < tensor->ne[1]; i1++) {
+ for (int i0 = 0; i0 < tensor->ne[0]; i0++) {
+ float value = ggml_ext_tensor_get_f32(tensor, i0, i1, i2);
+ value *= weights[i1];
+ ggml_ext_tensor_set_f32(tensor, value, i0, i1, i2);
+ }
+ }
+ }
+ float new_mean = ggml_ext_tensor_mean(tensor);
+ ggml_ext_tensor_scale_inplace(tensor, (original_mean / new_mean));
+ }
+
+ GGML_ASSERT(hidden_states->ne[1] > prompt_template_encode_start_idx);
+
+ ggml_tensor* new_hidden_states = ggml_new_tensor_3d(work_ctx,
+ GGML_TYPE_F32,
+ hidden_states->ne[0],
+ hidden_states->ne[1] - prompt_template_encode_start_idx,
+ hidden_states->ne[2]);
+
+ ggml_ext_tensor_iter(new_hidden_states, [&](ggml_tensor* new_hidden_states, int64_t i0, int64_t i1, int64_t i2, int64_t i3) {
+ float value = ggml_ext_tensor_get_f32(hidden_states, i0, i1 + prompt_template_encode_start_idx, i2, i3);
+ ggml_ext_tensor_set_f32(new_hidden_states, value, i0, i1, i2, i3);
+ });
+
+ int64_t t1 = ggml_time_ms();
+ LOG_DEBUG("computing condition graph completed, taking %" PRId64 " ms", t1 - t0);
+ return {new_hidden_states, nullptr, nullptr};
}
};
diff --git a/control.hpp b/control.hpp
index f9a492354..d86f64cbf 100644
--- a/control.hpp
+++ b/control.hpp
@@ -27,6 +27,7 @@ class ControlNetBlock : public GGMLBlock {
int num_heads = 8;
int num_head_channels = -1; // channels // num_heads
int context_dim = 768; // 1024 for VERSION_SD2, 2048 for VERSION_SDXL
+ bool use_linear_projection = false;
public:
int model_channels = 320;
@@ -82,7 +83,7 @@ class ControlNetBlock : public GGMLBlock {
int64_t d_head,
int64_t depth,
int64_t context_dim) -> SpatialTransformer* {
- return new SpatialTransformer(in_channels, n_head, d_head, depth, context_dim);
+ return new SpatialTransformer(in_channels, n_head, d_head, depth, context_dim, use_linear_projection);
};
auto make_zero_conv = [&](int64_t channels) {
@@ -165,7 +166,7 @@ class ControlNetBlock : public GGMLBlock {
}
struct ggml_tensor* resblock_forward(std::string name,
- struct ggml_context* ctx,
+ GGMLRunnerContext* ctx,
struct ggml_tensor* x,
struct ggml_tensor* emb) {
auto block = std::dynamic_pointer_cast<ResBlock>(blocks[name]);
@@ -173,15 +174,14 @@ class ControlNetBlock : public GGMLBlock {
}
struct ggml_tensor* attention_layer_forward(std::string name,
- struct ggml_context* ctx,
- ggml_backend_t backend,
+ GGMLRunnerContext* ctx,
struct ggml_tensor* x,
struct ggml_tensor* context) {
auto block = std::dynamic_pointer_cast<SpatialTransformer>(blocks[name]);
- return block->forward(ctx, backend, x, context);
+ return block->forward(ctx, x, context);
}
- struct ggml_tensor* input_hint_block_forward(struct ggml_context* ctx,
+ struct ggml_tensor* input_hint_block_forward(GGMLRunnerContext* ctx,
struct ggml_tensor* hint,
struct ggml_tensor* emb,
struct ggml_tensor* context) {
@@ -193,33 +193,32 @@ class ControlNetBlock : public GGMLBlock {
h = block->forward(ctx, h);
} else {
- h = ggml_silu_inplace(ctx, h);
+ h = ggml_silu_inplace(ctx->ggml_ctx, h);
}
}
return h;
}
- std::vector<struct ggml_tensor*> forward(struct ggml_context* ctx,
- ggml_backend_t backend,
+ std::vector<struct ggml_tensor*> forward(GGMLRunnerContext* ctx,
struct ggml_tensor* x,
struct ggml_tensor* hint,
struct ggml_tensor* guided_hint,
struct ggml_tensor* timesteps,
struct ggml_tensor* context,
- struct ggml_tensor* y = NULL) {
+ struct ggml_tensor* y = nullptr) {
// x: [N, in_channels, h, w] or [N, in_channels/2, h, w]
// timesteps: [N,]
// context: [N, max_position, hidden_size] or [1, max_position, hidden_size]. for example, [N, 77, 768]
// y: [N, adm_in_channels] or [1, adm_in_channels]
- if (context != NULL) {
+ if (context != nullptr) {
if (context->ne[2] != x->ne[3]) {
- context = ggml_repeat(ctx, context, ggml_new_tensor_3d(ctx, GGML_TYPE_F32, context->ne[0], context->ne[1], x->ne[3]));
+ context = ggml_repeat(ctx->ggml_ctx, context, ggml_new_tensor_3d(ctx->ggml_ctx, GGML_TYPE_F32, context->ne[0], context->ne[1], x->ne[3]));
}
}
- if (y != NULL) {
+ if (y != nullptr) {
if (y->ne[1] != x->ne[3]) {
- y = ggml_repeat(ctx, y, ggml_new_tensor_2d(ctx, GGML_TYPE_F32, y->ne[0], x->ne[3]));
+ y = ggml_repeat(ctx->ggml_ctx, y, ggml_new_tensor_2d(ctx->ggml_ctx, GGML_TYPE_F32, y->ne[0], x->ne[3]));
}
}
@@ -230,27 +229,27 @@ class ControlNetBlock : public GGMLBlock {
auto middle_block_out = std::dynamic_pointer_cast<Conv2d>(blocks["middle_block_out.0"]);
- auto t_emb = ggml_nn_timestep_embedding(ctx, timesteps, model_channels); // [N, model_channels]
+ auto t_emb = ggml_ext_timestep_embedding(ctx->ggml_ctx, timesteps, model_channels); // [N, model_channels]
auto emb = time_embed_0->forward(ctx, t_emb);
- emb = ggml_silu_inplace(ctx, emb);
+ emb = ggml_silu_inplace(ctx->ggml_ctx, emb);
emb = time_embed_2->forward(ctx, emb); // [N, time_embed_dim]
// SDXL/SVD
- if (y != NULL) {
+ if (y != nullptr) {
auto label_embed_0 = std::dynamic_pointer_cast<Linear>(blocks["label_emb.0.0"]);
auto label_embed_2 = std::dynamic_pointer_cast<Linear>(blocks["label_emb.0.2"]);
auto label_emb = label_embed_0->forward(ctx, y);
- label_emb = ggml_silu_inplace(ctx, label_emb);
+ label_emb = ggml_silu_inplace(ctx->ggml_ctx, label_emb);
label_emb = label_embed_2->forward(ctx, label_emb); // [N, time_embed_dim]
- emb = ggml_add(ctx, emb, label_emb); // [N, time_embed_dim]
+ emb = ggml_add(ctx->ggml_ctx, emb, label_emb); // [N, time_embed_dim]
}
std::vector<struct ggml_tensor*> outs;
- if (guided_hint == NULL) {
+ if (guided_hint == nullptr) {
guided_hint = input_hint_block_forward(ctx, hint, emb, context);
}
outs.push_back(guided_hint);
@@ -259,7 +258,7 @@ class ControlNetBlock : public GGMLBlock {
// input block 0
auto h = input_blocks_0_0->forward(ctx, x);
- h = ggml_add(ctx, h, guided_hint);
+ h = ggml_add(ctx->ggml_ctx, h, guided_hint);
outs.push_back(zero_convs_0->forward(ctx, h));
// input block 1-11
@@ -274,7 +273,7 @@ class ControlNetBlock : public GGMLBlock {
h = resblock_forward(name, ctx, h, emb); // [N, mult*model_channels, h, w]
if (std::find(attention_resolutions.begin(), attention_resolutions.end(), ds) != attention_resolutions.end()) {
std::string name = "input_blocks." + std::to_string(input_block_idx) + ".1";
- h = attention_layer_forward(name, ctx, backend, h, context); // [N, mult*model_channels, h, w]
+ h = attention_layer_forward(name, ctx, h, context); // [N, mult*model_channels, h, w]
}
auto zero_conv = std::dynamic_pointer_cast<Conv2d>(blocks["zero_convs." + std::to_string(input_block_idx) + ".0"]);
@@ -298,9 +297,9 @@ class ControlNetBlock : public GGMLBlock {
// [N, 4*model_channels, h/8, w/8]
// middle_block
- h = resblock_forward("middle_block.0", ctx, h, emb); // [N, 4*model_channels, h/8, w/8]
- h = attention_layer_forward("middle_block.1", ctx, backend, h, context); // [N, 4*model_channels, h/8, w/8]
- h = resblock_forward("middle_block.2", ctx, h, emb); // [N, 4*model_channels, h/8, w/8]
+ h = resblock_forward("middle_block.0", ctx, h, emb); // [N, 4*model_channels, h/8, w/8]
+ h = attention_layer_forward("middle_block.1", ctx, h, context); // [N, 4*model_channels, h/8, w/8]
+ h = resblock_forward("middle_block.2", ctx, h, emb); // [N, 4*model_channels, h/8, w/8]
// out
outs.push_back(middle_block_out->forward(ctx, h));
@@ -312,39 +311,28 @@ struct ControlNet : public GGMLRunner {
SDVersion version = VERSION_SD1;
ControlNetBlock control_net;
- ggml_backend_buffer_t control_buffer = NULL; // keep control output tensors in backend memory
- ggml_context* control_ctx = NULL;
+ ggml_backend_buffer_t control_buffer = nullptr; // keep control output tensors in backend memory
+ ggml_context* control_ctx = nullptr;
std::vector<struct ggml_tensor*> controls; // (12 input block outputs, 1 middle block output) SD 1.5
- struct ggml_tensor* guided_hint = NULL; // guided_hint cache, for faster inference
+ struct ggml_tensor* guided_hint = nullptr; // guided_hint cache, for faster inference
bool guided_hint_cached = false;
ControlNet(ggml_backend_t backend,
bool offload_params_to_cpu,
- const String2GGMLType& tensor_types = {},
- SDVersion version = VERSION_SD1)
+ const String2TensorStorage& tensor_storage_map = {},
+ SDVersion version = VERSION_SD1)
: GGMLRunner(backend, offload_params_to_cpu), control_net(version) {
- control_net.init(params_ctx, tensor_types, "");
+ control_net.init(params_ctx, tensor_storage_map, "");
}
- void enable_conv2d_direct() {
- std::vector<GGMLBlock*> blocks;
- control_net.get_all_blocks(blocks);
- for (auto block : blocks) {
- if (block->get_desc() == "Conv2d") {
- auto conv_block = (Conv2d*)block;
- conv_block->enable_direct();
- }
- }
- }
-
- ~ControlNet() {
+ ~ControlNet() override {
free_control_ctx();
}
void alloc_control_ctx(std::vector<struct ggml_tensor*> outs) {
struct ggml_init_params params;
params.mem_size = static_cast(outs.size() * ggml_tensor_overhead()) + 1024 * 1024;
- params.mem_buffer = NULL;
+ params.mem_buffer = nullptr;
params.no_alloc = true;
control_ctx = ggml_init(params);
@@ -366,20 +354,20 @@ struct ControlNet : public GGMLRunner {
}
void free_control_ctx() {
- if (control_buffer != NULL) {
+ if (control_buffer != nullptr) {
ggml_backend_buffer_free(control_buffer);
- control_buffer = NULL;
+ control_buffer = nullptr;
}
- if (control_ctx != NULL) {
+ if (control_ctx != nullptr) {
ggml_free(control_ctx);
- control_ctx = NULL;
+ control_ctx = nullptr;
}
- guided_hint = NULL;
+ guided_hint = nullptr;
guided_hint_cached = false;
controls.clear();
}
- std::string get_desc() {
+ std::string get_desc() override {
return "control_net";
}
@@ -391,12 +379,12 @@ struct ControlNet : public GGMLRunner {
struct ggml_tensor* hint,
struct ggml_tensor* timesteps,
struct ggml_tensor* context,
- struct ggml_tensor* y = NULL) {
- struct ggml_cgraph* gf = ggml_new_graph_custom(compute_ctx, CONTROL_NET_GRAPH_SIZE, false);
+ struct ggml_tensor* y = nullptr) {
+ struct ggml_cgraph* gf = new_graph_custom(CONTROL_NET_GRAPH_SIZE);
x = to_backend(x);
if (guided_hint_cached) {
- hint = NULL;
+ hint = nullptr;
} else {
hint = to_backend(hint);
}
@@ -404,16 +392,17 @@ struct ControlNet : public GGMLRunner {
y = to_backend(y);
timesteps = to_backend(timesteps);
- auto outs = control_net.forward(compute_ctx,
- runtime_backend,
+ auto runner_ctx = get_context();
+
+ auto outs = control_net.forward(&runner_ctx,
x,
hint,
- guided_hint_cached ? guided_hint : NULL,
+ guided_hint_cached ? guided_hint : nullptr,
timesteps,
context,
y);
- if (control_ctx == NULL) {
+ if (control_ctx == nullptr) {
alloc_control_ctx(outs);
}
@@ -431,8 +420,8 @@ struct ControlNet : public GGMLRunner {
struct ggml_tensor* timesteps,
struct ggml_tensor* context,
struct ggml_tensor* y,
- struct ggml_tensor** output = NULL,
- struct ggml_context* output_ctx = NULL) {
+ struct ggml_tensor** output = nullptr,
+ struct ggml_context* output_ctx = nullptr) {
// x: [N, in_channels, h, w]
// timesteps: [N, ]
// context: [N, max_position, hidden_size]([N, 77, 768]) or [1, max_position, hidden_size]
@@ -445,7 +434,7 @@ struct ControlNet : public GGMLRunner {
guided_hint_cached = true;
}
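// Since guided_hint depends only on the control image (not on x or the
// timestep), it is computed on the first call and reused for every later
// denoising step. A sketch of the intended call pattern (hypothetical loop):
//   for (int step = 0; step < n_steps; step++) {
//       // hint is uploaded and encoded only on the first iteration
//       control_net->compute(n_threads, noised, hint, t, context, y);
//   }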
- bool load_from_file(const std::string& file_path) {
+ bool load_from_file(const std::string& file_path, int n_threads) {
LOG_INFO("loading control net from '%s'", file_path.c_str());
alloc_params_buffer();
std::map<std::string, struct ggml_tensor*> tensors;
@@ -453,12 +442,12 @@ struct ControlNet : public GGMLRunner {
std::set<std::string> ignore_tensors;
ModelLoader model_loader;
- if (!model_loader.init_from_file(file_path)) {
+ if (!model_loader.init_from_file_and_convert_name(file_path)) {
LOG_ERROR("init control net model loader from file failed: '%s'", file_path.c_str());
return false;
}
- bool success = model_loader.load_tensors(tensors, ignore_tensors);
+ bool success = model_loader.load_tensors(tensors, ignore_tensors, n_threads);
if (!success) {
LOG_ERROR("load control net tensors from model loader failed");
diff --git a/denoiser.hpp b/denoiser.hpp
index 20d5f726a..5ff45bb2c 100644
--- a/denoiser.hpp
+++ b/denoiser.hpp
@@ -19,7 +19,7 @@ struct SigmaSchedule {
};
struct DiscreteSchedule : SigmaSchedule {
- std::vector<float> get_sigmas(uint32_t n, float sigma_min, float sigma_max, t_to_sigma_t t_to_sigma) {
+ std::vector<float> get_sigmas(uint32_t n, float sigma_min, float sigma_max, t_to_sigma_t t_to_sigma) override {
std::vector<float> result;
int t_max = TIMESTEPS - 1;
@@ -43,7 +43,7 @@ struct DiscreteSchedule : SigmaSchedule {
};
struct ExponentialSchedule : SigmaSchedule {
- std::vector<float> get_sigmas(uint32_t n, float sigma_min, float sigma_max, t_to_sigma_t t_to_sigma) {
+ std::vector<float> get_sigmas(uint32_t n, float sigma_min, float sigma_max, t_to_sigma_t t_to_sigma) override {
std::vector<float> sigmas;
// Calculate step size
@@ -150,7 +150,7 @@ std::vector<float> log_linear_interpolation(std::vector<float> sigma_in,
https://research.nvidia.com/labs/toronto-ai/AlignYourSteps/howto.html
*/
struct AYSSchedule : SigmaSchedule {
- std::vector<float> get_sigmas(uint32_t n, float sigma_min, float sigma_max, t_to_sigma_t t_to_sigma) {
+ std::vector<float> get_sigmas(uint32_t n, float sigma_min, float sigma_max, t_to_sigma_t t_to_sigma) override {
const std::vector<float> noise_levels[] = {
/* SD1.5 */
{14.6146412293f, 6.4745760956f, 3.8636745985f, 2.6946151520f,
@@ -204,7 +204,7 @@ struct AYSSchedule : SigmaSchedule {
* GITS Scheduler: https://github.com/zju-pi/diff-sampler/tree/main/gits-main
*/
struct GITSSchedule : SigmaSchedule {
- std::vector<float> get_sigmas(uint32_t n, float sigma_min, float sigma_max, t_to_sigma_t t_to_sigma) {
+ std::vector<float> get_sigmas(uint32_t n, float sigma_min, float sigma_max, t_to_sigma_t t_to_sigma) override {
if (sigma_max <= 0.0f) {
return std::vector<float>{};
}
@@ -232,8 +232,27 @@ struct GITSSchedule : SigmaSchedule {
}
};
+struct SGMUniformSchedule : SigmaSchedule {
+ std::vector<float> get_sigmas(uint32_t n, float sigma_min_in, float sigma_max_in, t_to_sigma_t t_to_sigma_func) override {
+ std::vector<float> result;
+ if (n == 0) {
+ result.push_back(0.0f);
+ return result;
+ }
+ result.reserve(n + 1);
+ int t_max = TIMESTEPS - 1;
+ int t_min = 0;
+ std::vector<float> timesteps = linear_space(static_cast<float>(t_max), static_cast<float>(t_min), n + 1);
+ for (int i = 0; i < n; i++) {
+ result.push_back(t_to_sigma_func(timesteps[i]));
+ }
+ result.push_back(0.0f);
+ return result;
+ }
+};
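// SGMUniformSchedule spaces timesteps linearly from t_max down to 0 and maps
// them through t_to_sigma, appending a terminal 0. E.g. with n = 4 and
// TIMESTEPS = 1000, timesteps = {999, 749.25, 499.5, 249.75, 0} and the result
// is {t_to_sigma(999), t_to_sigma(749.25), t_to_sigma(499.5), t_to_sigma(249.75), 0}.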
+
struct KarrasSchedule : SigmaSchedule {
- std::vector<float> get_sigmas(uint32_t n, float sigma_min, float sigma_max, t_to_sigma_t t_to_sigma) {
+ std::vector<float> get_sigmas(uint32_t n, float sigma_min, float sigma_max, t_to_sigma_t t_to_sigma) override {
// These *COULD* be function arguments here,
// but does anybody ever bother to touch them?
float rho = 7.f;
@@ -251,6 +270,35 @@ struct KarrasSchedule : SigmaSchedule {
}
};
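// For reference, the Karras et al. (2022) schedule computes
//   sigma_i = (sigma_max^(1/rho) + (i / (n - 1)) * (sigma_min^(1/rho) - sigma_max^(1/rho)))^rho
// for i = 0..n-1; the default rho = 7 concentrates steps near sigma_min.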
+struct SimpleSchedule : SigmaSchedule {
+ std::vector