diff --git a/.gitignore b/.gitignore index 6010b30..d28572a 100644 --- a/.gitignore +++ b/.gitignore @@ -1,5 +1,6 @@ model_zoo/ outputs/ +*benchmark_tmp.csv # Byte-compiled / optimized / DLL files __pycache__/ @@ -130,6 +131,7 @@ venv/ ENV/ env.bak/ venv.bak/ +.venv*/ # Spyder project settings .spyderproject @@ -160,4 +162,6 @@ cython_debug/ # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore # and can be added to the global gitignore or merged into this file. For a more nuclear # option (not recommended) you can uncomment the following to ignore the entire idea folder. -#.idea/ \ No newline at end of file +#.idea/ + +.vscode/ \ No newline at end of file diff --git a/Dockerfile b/Dockerfile new file mode 100644 index 0000000..8be068f --- /dev/null +++ b/Dockerfile @@ -0,0 +1,8 @@ +FROM nvidia/cuda:11.2.1-base-ubuntu20.04 +RUN apt-get update && \ + apt-get install --no-install-recommends --no-install-suggests -y \ + curl python3 python3-pip +WORKDIR /lambda_diffusers +COPY . . +RUN pip3 install --no-cache-dir -r requirements.txt +CMD ["python3", "-u", "scripts/benchmark.py", "--samples", "1,2,4,8,16"] \ No newline at end of file diff --git a/LICENSE b/LICENSE new file mode 100644 index 0000000..ea7b0ed --- /dev/null +++ b/LICENSE @@ -0,0 +1,21 @@ +MIT License + +Copyright (c) 2022 Lambda, Inc + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. diff --git a/README.md b/README.md index eea53d1..7306ced 100644 --- a/README.md +++ b/README.md @@ -5,6 +5,11 @@ _Additional models and pipelines for 🤗 Diffusers created by [Lambda Labs](htt - [Stable Diffusion Image Variations](#stable-diffusion-image-variations) - [Pokemon text to image](#pokemon-text-to-image) + +

+🦄 Other exciting ML projects at Lambda: ML Times, Distributed Training Guide, Text2Video, GPU Benchmark. +

+ ## Installation ```bash @@ -31,21 +36,33 @@ A fine-tuned version of Stable Diffusion conditioned on CLIP image embeddings to ### Usage ```python -from pathlib import Path -from lambda_diffusers import StableDiffusionImageEmbedPipeline +from diffusers import StableDiffusionImageVariationPipeline from PIL import Image -import torch -device = "cuda" if torch.cuda.is_available() else "cpu" -pipe = StableDiffusionImageEmbedPipeline.from_pretrained("lambdalabs/sd-image-variations-diffusers") -pipe = pipe.to(device) -im = Image.open("your/input/image/here.jpg") -num_samples = 4 -image = pipe(num_samples*[im], guidance_scale=3.0) -image = image["sample"] -base_path = Path("outputs/im2im") -base_path.mkdir(exist_ok=True, parents=True) -for idx, im in enumerate(image): - im.save(base_path/f"{idx:06}.jpg") + +device = "cuda:0" +sd_pipe = StableDiffusionImageVariationPipeline.from_pretrained( + "lambdalabs/sd-image-variations-diffusers", + revision="v2.0", + ) +sd_pipe = sd_pipe.to(device) + +im = Image.open("path/to/image.jpg") +tform = transforms.Compose([ + transforms.ToTensor(), + transforms.Resize( + (224, 224), + interpolation=transforms.InterpolationMode.BICUBIC, + antialias=False, + ), + transforms.Normalize( + [0.48145466, 0.4578275, 0.40821073], + [0.26862954, 0.26130258, 0.27577711]), +]) +inp = tform(im).to(device) + +out = sd_pipe(inp, guidance_scale=3) +out["images"][0].save("result.jpg") + ``` ## Pokemon text to image @@ -54,6 +71,7 @@ __Stable Diffusion fine tuned on Pokémon by [Lambda Labs](https://lambdalabs.co [![Open in Replicate](https://img.shields.io/badge/%F0%9F%9A%80-Open%20in%20Replicate-%23fff891)](https://replicate.com/lambdal/text-to-pokemon) [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/LambdaLabsML/lambda-diffusers/blob/main/notebooks/pokemon_demo.ipynb) +[![Open in Spaces](https://img.shields.io/badge/%F0%9F%A4%97-Open%20in%20Spaces-orange)](https://huggingface.co/spaces/lambdalabs/text-to-pokemon) Put in a text prompt and generate your own Pokémon character, no "prompt engineering" required! @@ -75,7 +93,7 @@ import torch from diffusers import StableDiffusionPipeline from torch import autocast -pipe = StableDiffusionPipeline.from_pretrained("lambdalabs/sd-pokemon-diffusers", torch_dtype=torch.float16) +pipe = StableDiffusionPipeline.from_pretrained("lambdalabs/sd-pokemon-diffusers", torch_dtype=torch.float16) pipe = pipe.to("cuda") prompt = "Yoda" @@ -98,6 +116,36 @@ for idx, im in enumerate(images): im.save(f"{idx:06}.png") ``` +## Benchmarking inference + +We have updated the original benchmark using xformers and a newer version of Diffusers, see the [new results here](./docs/benchmark-update.md) (original results can still be found [here](./docs/benchmark.md)). + +### Usage + +Ensure that [NVIDIA container toolkit](https://docs.nvidia.com/datacenter/cloud-native/container-toolkit/install-guide.html) is installed on your system and then run the following: + +```bash +git clone https://github.com/LambdaLabsML/lambda-diffusers.git +cd lambda-diffusers/scripts +make bench +``` + +Currently `xformers` does not support H100. The "without xformers" results below are generated by running the benchmark with `--xformers no` (can be set in `scripts/Makefile`) + +### Results + +With [xformers](https://github.com/facebookresearch/xformers), raw data can be found [here](./benchmarks/benchmark.csv). 
+![](./docs/pictures/sd_throughput.png) + +Without [xformers](https://github.com/facebookresearch/xformers), raw data can be found [here](./benchmarks/benchmark_no_xformers.csv). +![](./docs/pictures/sd_throughput_noxformer.png) + +H100 MIG performance, raw data can be found [here](./benchmarks/benchmark_H100_MIG.csv). +![](./docs/pictures/sd_throughput_mig.png) + +Cost analysis +![](./docs/pictures/cost_analysis.png) + ## Links - [Captioned Pokémon dataset](https://huggingface.co/datasets/lambdalabs/pokemon-blip-captions) diff --git a/benchmarks/benchmark.csv b/benchmarks/benchmark.csv new file mode 100644 index 0000000..aaa55ea --- /dev/null +++ b/benchmarks/benchmark.csv @@ -0,0 +1,81 @@ +device,precision,autocast,xformers,runtime,n_samples,latency,memory +NVIDIA A10,half,FALSE,TRUE,pytorch,1,2.01,3.13 +NVIDIA A10,single,FALSE,TRUE,pytorch,1,4.69,6.29 +NVIDIA A10,half,FALSE,TRUE,pytorch,2,3.65,4.3 +NVIDIA A10,single,FALSE,TRUE,pytorch,2,7.75,8.57 +NVIDIA A10,half,FALSE,TRUE,pytorch,4,6.68,6.63 +NVIDIA A10,single,FALSE,TRUE,pytorch,4,14.35,11.24 +NVIDIA A10,half,FALSE,TRUE,pytorch,8,12.93,11.05 +NVIDIA A10,single,FALSE,TRUE,pytorch,8,28.28,17.91 +NVIDIA A10,half,FALSE,TRUE,pytorch,16,24.65,19.86 +NVIDIA A10,single,FALSE,TRUE,pytorch,16,57.5,21.21 +NVIDIA A10,half,FALSE,TRUE,pytorch,32,48.79,7.37 +NVIDIA A10,single,FALSE,TRUE,pytorch,32,108.78,15.88 +NVIDIA A10,half,FALSE,TRUE,pytorch,64,108.26,17.54 +NVIDIA A10,single,FALSE,TRUE,pytorch,64,-1,-1 +NVIDIA A10,half,FALSE,TRUE,pytorch,128,212.94,22.18 +NVIDIA A10,single,FALSE,TRUE,pytorch,128,-1,-1 +NVIDIA A100 80GB PCIe,single,FALSE,TRUE,pytorch,1,1.78,6.1 +NVIDIA A100 80GB PCIe,half,FALSE,TRUE,pytorch,1,1.17,3.19 +NVIDIA A100 80GB PCIe,single,FALSE,TRUE,pytorch,2,3.68,8.03 +NVIDIA A100 80GB PCIe,half,FALSE,TRUE,pytorch,2,1.73,4.33 +NVIDIA A100 80GB PCIe,single,FALSE,TRUE,pytorch,4,5.56,11.53 +NVIDIA A100 80GB PCIe,half,FALSE,TRUE,pytorch,4,3.73,6.62 +NVIDIA A100 80GB PCIe,single,FALSE,TRUE,pytorch,8,10.95,18.12 +NVIDIA A100 80GB PCIe,half,FALSE,TRUE,pytorch,8,5.25,11.12 +NVIDIA A100 80GB PCIe,single,FALSE,TRUE,pytorch,16,21.05,33.04 +NVIDIA A100 80GB PCIe,half,FALSE,TRUE,pytorch,16,9.93,19.81 +NVIDIA A100 80GB PCIe,single,FALSE,TRUE,pytorch,32,41.02,14.41 +NVIDIA A100 80GB PCIe,half,FALSE,TRUE,pytorch,32,18.75,7.34 +NVIDIA A100 80GB PCIe,single,FALSE,TRUE,pytorch,64,80.45,26.17 +NVIDIA A100 80GB PCIe,half,FALSE,TRUE,pytorch,64,36.89,12.46 +NVIDIA A100 80GB PCIe,single,FALSE,TRUE,pytorch,128,161.52,48.01 +NVIDIA A100 80GB PCIe,half,FALSE,TRUE,pytorch,128,73.72,22.68 +NVIDIA A100-SXM4-40GB,single,FALSE,TRUE,pytorch,1,1.79,6.11 +NVIDIA A100-SXM4-40GB,half,FALSE,TRUE,pytorch,1,1.18,3.18 +NVIDIA A100-SXM4-40GB,single,FALSE,TRUE,pytorch,2,2.97,8.03 +NVIDIA A100-SXM4-40GB,half,FALSE,TRUE,pytorch,2,1.66,4.32 +NVIDIA A100-SXM4-40GB,single,FALSE,TRUE,pytorch,4,5.35,11.54 +NVIDIA A100-SXM4-40GB,half,FALSE,TRUE,pytorch,4,2.68,6.61 +NVIDIA A100-SXM4-40GB,single,FALSE,TRUE,pytorch,8,10.16,18.11 +NVIDIA A100-SXM4-40GB,half,FALSE,TRUE,pytorch,8,4.85,11.12 +NVIDIA A100-SXM4-40GB,half,FALSE,TRUE,pytorch,16,9.13,19.8 +NVIDIA A100-SXM4-40GB,single,FALSE,TRUE,pytorch,16,19.71,33.25 +NVIDIA A100-SXM4-40GB,half,FALSE,TRUE,pytorch,32,17.72,7.33 +NVIDIA A100-SXM4-40GB,single,FALSE,TRUE,pytorch,32,39.03,14.39 +NVIDIA A100-SXM4-40GB,half,FALSE,TRUE,pytorch,64,34.92,13.79 +NVIDIA A100-SXM4-40GB,single,FALSE,TRUE,pytorch,64,77.05,26.34 +NVIDIA A100-SXM4-40GB,half,FALSE,TRUE,pytorch,128,69.31,22.68 +NVIDIA A100-SXM4-40GB,single,FALSE,TRUE,pytorch,128,-1,-1 +NVIDIA RTX 
A6000,single,FALSE,TRUE,pytorch,1,3.61,6.35 +NVIDIA RTX A6000,half,FALSE,TRUE,pytorch,1,1.93,3.15 +NVIDIA RTX A6000,single,FALSE,TRUE,pytorch,2,5.57,7.73 +NVIDIA RTX A6000,half,FALSE,TRUE,pytorch,2,2.84,4.37 +NVIDIA RTX A6000,single,FALSE,TRUE,pytorch,4,9.67,10.7 +NVIDIA RTX A6000,half,FALSE,TRUE,pytorch,4,4.56,6.64 +NVIDIA RTX A6000,single,FALSE,TRUE,pytorch,8,18.96,16.87 +NVIDIA RTX A6000,half,FALSE,TRUE,pytorch,8,8.39,11.19 +NVIDIA RTX A6000,single,FALSE,TRUE,pytorch,16,37.89,28.82 +NVIDIA RTX A6000,half,FALSE,TRUE,pytorch,16,15.62,20.01 +NVIDIA RTX A6000,single,FALSE,TRUE,pytorch,32,71.57,14.26 +NVIDIA RTX A6000,half,FALSE,TRUE,pytorch,32,31.19,7.65 +NVIDIA RTX A6000,single,FALSE,TRUE,pytorch,64,143.26,26.42 +NVIDIA RTX A6000,half,FALSE,TRUE,pytorch,64,65.72,23.84 +NVIDIA RTX A6000,single,FALSE,TRUE,pytorch,128,287.96,47.92 +NVIDIA RTX A6000,half,FALSE,TRUE,pytorch,128,130.38,34.36 +Tesla V100-SXM2-16GB,single,FALSE,TRUE,pytorch,1,4.42,5.7 +Tesla V100-SXM2-16GB,half,FALSE,TRUE,pytorch,1,1.84,3.24 +Tesla V100-SXM2-16GB,single,FALSE,TRUE,pytorch,2,8.33,8.6 +Tesla V100-SXM2-16GB,half,FALSE,TRUE,pytorch,2,3.08,4.17 +Tesla V100-SXM2-16GB,single,FALSE,TRUE,pytorch,4,16.56,11.86 +Tesla V100-SXM2-16GB,half,FALSE,TRUE,pytorch,4,5.62,6.42 +Tesla V100-SXM2-16GB,single,FALSE,TRUE,pytorch,8,28.71,15.88 +Tesla V100-SXM2-16GB,half,FALSE,TRUE,pytorch,8,10.64,10.45 +Tesla V100-SXM2-16GB,half,FALSE,TRUE,pytorch,16,20.96,10.87 +Tesla V100-SXM2-16GB,single,FALSE,TRUE,pytorch,16,-1,-1 +Tesla V100-SXM2-16GB,half,FALSE,TRUE,pytorch,32,40.13,7.73 +Tesla V100-SXM2-16GB,single,FALSE,TRUE,pytorch,32,110.17,15.72 +Tesla V100-SXM2-16GB,half,FALSE,TRUE,pytorch,64,79.82,13.51 +Tesla V100-SXM2-16GB,single,FALSE,TRUE,pytorch,64,-1,-1 +Tesla V100-SXM2-16GB,single,FALSE,TRUE,pytorch,128,-1,-1 +Tesla V100-SXM2-16GB,half,FALSE,TRUE,pytorch,128,-1,-1 diff --git a/benchmarks/benchmark_H100_MIG.csv b/benchmarks/benchmark_H100_MIG.csv new file mode 100644 index 0000000..87c70dd --- /dev/null +++ b/benchmarks/benchmark_H100_MIG.csv @@ -0,0 +1,65 @@ +device,precision,autocast,xformers,runtime,n_samples,latency,memory, +NVIDIA H100 PCIe,single,FALSE,FALSE,pytorch,1,1.73,7.7 +NVIDIA H100 PCIe,half,FALSE,FALSE,pytorch,1,1.06,3.46 +NVIDIA H100 PCIe,single,FALSE,FALSE,pytorch,2,2.66,9.79 +NVIDIA H100 PCIe,half,FALSE,FALSE,pytorch,2,1.73,4.57 +NVIDIA H100 PCIe,single,FALSE,FALSE,pytorch,4,4.47,18.49 +NVIDIA H100 PCIe,half,FALSE,FALSE,pytorch,4,2.63,8.91 +NVIDIA H100 PCIe,single,FALSE,FALSE,pytorch,8,8.16,23.86 +NVIDIA H100 PCIe,half,FALSE,FALSE,pytorch,8,4.97,12.57 +NVIDIA H100 PCIe,single,FALSE,FALSE,pytorch,16,15.98,42.38 +NVIDIA H100 PCIe,half,FALSE,FALSE,pytorch,16,9.61,29.01 +NVIDIA H100 PCIe,single,FALSE,FALSE,pytorch,32,32.04,80.51 +NVIDIA H100 PCIe,half,FALSE,FALSE,pytorch,32,19.07,55.57 +NVIDIA H100 PCIe,single,FALSE,FALSE,pytorch,64,-1,-1 +NVIDIA H100 PCIe,half,FALSE,FALSE,pytorch,64,-1,-1 +NVIDIA H100 PCIe,single,FALSE,FALSE,pytorch,128,-1,-1 +NVIDIA H100 PCIe,half,FALSE,FALSE,pytorch,128,-1,-1 +NVIDIA H100 PCIe MIG 4g.40gb,single,FALSE,FALSE,pytorch,1,2.3,7.74 +NVIDIA H100 PCIe MIG 4g.40gb,half,FALSE,FALSE,pytorch,1,1.52,3.45 +NVIDIA H100 PCIe MIG 4g.40gb,single,FALSE,FALSE,pytorch,2,3.95,9.48 +NVIDIA H100 PCIe MIG 4g.40gb,half,FALSE,FALSE,pytorch,2,2.42,4.57 +NVIDIA H100 PCIe MIG 4g.40gb,single,FALSE,FALSE,pytorch,4,7.12,18.2 +NVIDIA H100 PCIe MIG 4g.40gb,half,FALSE,FALSE,pytorch,4,4.17,8.9 +NVIDIA H100 PCIe MIG 4g.40gb,single,FALSE,FALSE,pytorch,8,13.91,23.75 +NVIDIA H100 PCIe MIG 
4g.40gb,half,FALSE,FALSE,pytorch,8,7.91,12.49 +NVIDIA H100 PCIe MIG 4g.40gb,single,FALSE,FALSE,pytorch,16,-1,-1 +NVIDIA H100 PCIe MIG 4g.40gb,half,FALSE,FALSE,pytorch,16,15.73,29.01 +NVIDIA H100 PCIe MIG 4g.40gb,single,FALSE,FALSE,pytorch,32,-1,-1 +NVIDIA H100 PCIe MIG 4g.40gb,half,FALSE,FALSE,pytorch,32,-1,-1 +NVIDIA H100 PCIe MIG 4g.40gb,single,FALSE,FALSE,pytorch,64,-1,-1 +NVIDIA H100 PCIe MIG 4g.40gb,half,FALSE,FALSE,pytorch,64,-1,-1 +NVIDIA H100 PCIe MIG 4g.40gb,single,FALSE,FALSE,pytorch,128,-1,-1 +NVIDIA H100 PCIe MIG 4g.40gb,half,FALSE,FALSE,pytorch,128,-1,-1 +NVIDIA H100 PCIe MIG 2g.20gb,single,FALSE,FALSE,pytorch,1,4.2,7.76 +NVIDIA H100 PCIe MIG 2g.20gb,half,FALSE,FALSE,pytorch,1,2.58,3.41 +NVIDIA H100 PCIe MIG 2g.20gb,single,FALSE,FALSE,pytorch,2,7.61,11.09 +NVIDIA H100 PCIe MIG 2g.20gb,half,FALSE,FALSE,pytorch,2,4.56,4.59 +NVIDIA H100 PCIe MIG 2g.20gb,single,FALSE,FALSE,pytorch,4,14.45,17.65 +NVIDIA H100 PCIe MIG 2g.20gb,half,FALSE,FALSE,pytorch,4,8.24,6.78 +NVIDIA H100 PCIe MIG 2g.20gb,single,FALSE,FALSE,pytorch,8,-1,-1 +NVIDIA H100 PCIe MIG 2g.20gb,half,FALSE,FALSE,pytorch,8,15.81,15.65 +NVIDIA H100 PCIe MIG 2g.20gb,single,FALSE,FALSE,pytorch,16,-1,-1 +NVIDIA H100 PCIe MIG 2g.20gb,half,FALSE,FALSE,pytorch,16,-1,-1 +NVIDIA H100 PCIe MIG 2g.20gb,single,FALSE,FALSE,pytorch,32,-1,-1 +NVIDIA H100 PCIe MIG 2g.20gb,half,FALSE,FALSE,pytorch,32,-1,-1 +NVIDIA H100 PCIe MIG 2g.20gb,single,FALSE,FALSE,pytorch,64,-1,-1 +NVIDIA H100 PCIe MIG 2g.20gb,half,FALSE,FALSE,pytorch,64,-1,-1 +NVIDIA H100 PCIe MIG 2g.20gb,single,FALSE,FALSE,pytorch,128,-1,-1 +NVIDIA H100 PCIe MIG 2g.20gb,half,FALSE,FALSE,pytorch,128,-1,-1 +NVIDIA H100 PCIe MIG 1g.10gb,single,FALSE,FALSE,pytorch,1,9.17,7.76 +NVIDIA H100 PCIe MIG 1g.10gb,half,FALSE,FALSE,pytorch,1,5.39,3.47 +NVIDIA H100 PCIe MIG 1g.10gb,single,FALSE,FALSE,pytorch,2,-1,-1 +NVIDIA H100 PCIe MIG 1g.10gb,half,FALSE,FALSE,pytorch,2,9.29,4.63 +NVIDIA H100 PCIe MIG 1g.10gb,single,FALSE,FALSE,pytorch,4,-1,-1 +NVIDIA H100 PCIe MIG 1g.10gb,half,FALSE,FALSE,pytorch,4,17.4,6.8 +NVIDIA H100 PCIe MIG 1g.10gb,single,FALSE,FALSE,pytorch,8,-1,-1 +NVIDIA H100 PCIe MIG 1g.10gb,half,FALSE,FALSE,pytorch,8,-1,-1 +NVIDIA H100 PCIe MIG 1g.10gb,single,FALSE,FALSE,pytorch,16,-1,-1 +NVIDIA H100 PCIe MIG 1g.10gb,half,FALSE,FALSE,pytorch,16,-1,-1 +NVIDIA H100 PCIe MIG 1g.10gb,single,FALSE,FALSE,pytorch,32,-1,-1 +NVIDIA H100 PCIe MIG 1g.10gb,half,FALSE,FALSE,pytorch,32,-1,-1 +NVIDIA H100 PCIe MIG 1g.10gb,single,FALSE,FALSE,pytorch,64,-1,-1 +NVIDIA H100 PCIe MIG 1g.10gb,half,FALSE,FALSE,pytorch,64,-1,-1 +NVIDIA H100 PCIe MIG 1g.10gb,single,FALSE,FALSE,pytorch,128,-1,-1 +NVIDIA H100 PCIe MIG 1g.10gb,half,FALSE,FALSE,pytorch,128,-1,-1 \ No newline at end of file diff --git a/benchmarks/benchmark_no_xformers.csv b/benchmarks/benchmark_no_xformers.csv new file mode 100644 index 0000000..d578b6d --- /dev/null +++ b/benchmarks/benchmark_no_xformers.csv @@ -0,0 +1,97 @@ +device,precision,autocast,xformers,runtime,n_samples,latency,memory, +NVIDIA A10,single,FALSE,FALSE,pytorch,1,4.75,6.73 +NVIDIA A10,half,FALSE,FALSE,pytorch,1,2.71,3.43 +NVIDIA A10,single,FALSE,FALSE,pytorch,2,8.75,9 +NVIDIA A10,half,FALSE,FALSE,pytorch,2,4.99,5.53 +NVIDIA A10,single,FALSE,FALSE,pytorch,4,17.18,18.14 +NVIDIA A10,half,FALSE,FALSE,pytorch,4,9.65,6.84 +NVIDIA A10,single,FALSE,FALSE,pytorch,8,-1,-1 +NVIDIA A10,half,FALSE,FALSE,pytorch,8,18.58,12.66 +NVIDIA A10,single,FALSE,FALSE,pytorch,16,-1,-1 +NVIDIA A10,half,FALSE,FALSE,pytorch,16,36.32,20.64 +NVIDIA A10,single,FALSE,FALSE,pytorch,32,-1,-1 +NVIDIA 
A10,half,FALSE,FALSE,pytorch,32,-1,-1 +NVIDIA A10,single,FALSE,FALSE,pytorch,64,-1,-1 +NVIDIA A10,half,FALSE,FALSE,pytorch,64,-1,-1 +NVIDIA A10,single,FALSE,FALSE,pytorch,128,-1,-1 +NVIDIA A10,half,FALSE,FALSE,pytorch,128,-1,-1 +NVIDIA A100-SXM4-40GB,single,FALSE,FALSE,pytorch,1,1.72,7.76 +NVIDIA A100-SXM4-40GB,half,FALSE,FALSE,pytorch,1,1.18,3.41 +NVIDIA A100-SXM4-40GB,single,FALSE,FALSE,pytorch,2,3.03,9.04 +NVIDIA A100-SXM4-40GB,half,FALSE,FALSE,pytorch,2,1.88,5.53 +NVIDIA A100-SXM4-40GB,single,FALSE,FALSE,pytorch,4,5.53,18.04 +NVIDIA A100-SXM4-40GB,half,FALSE,FALSE,pytorch,4,3.35,6.74 +NVIDIA A100-SXM4-40GB,single,FALSE,FALSE,pytorch,8,10.95,23.85 +NVIDIA A100-SXM4-40GB,half,FALSE,FALSE,pytorch,8,6.28,12.6 +NVIDIA A100-SXM4-40GB,single,FALSE,FALSE,pytorch,16,-1,-1 +NVIDIA A100-SXM4-40GB,half,FALSE,FALSE,pytorch,16,12.57,20.58 +NVIDIA A100-SXM4-40GB,single,FALSE,FALSE,pytorch,32,-1,-1 +NVIDIA A100-SXM4-40GB,half,FALSE,FALSE,pytorch,32,-1,-1 +NVIDIA A100-SXM4-40GB,single,FALSE,FALSE,pytorch,64,-1,-1 +NVIDIA A100-SXM4-40GB,half,FALSE,FALSE,pytorch,64,-1,-1 +NVIDIA A100-SXM4-40GB,single,FALSE,FALSE,pytorch,128,-1,-1 +NVIDIA A100-SXM4-40GB,half,FALSE,FALSE,pytorch,128,-1,-1 +NVIDIA A100-PCIE-40GB,single,FALSE,FALSE,pytorch,1,1.99,7.76 +NVIDIA A100-PCIE-40GB,half,FALSE,FALSE,pytorch,1,1.5,3.45 +NVIDIA A100-PCIE-40GB,single,FALSE,FALSE,pytorch,2,3.52,11.11 +NVIDIA A100-PCIE-40GB,half,FALSE,FALSE,pytorch,2,2.3,4.53 +NVIDIA A100-PCIE-40GB,single,FALSE,FALSE,pytorch,4,6.31,13.98 +NVIDIA A100-PCIE-40GB,half,FALSE,FALSE,pytorch,4,4.04,8.91 +NVIDIA A100-PCIE-40GB,single,FALSE,FALSE,pytorch,8,12.21,23.91 +NVIDIA A100-PCIE-40GB,half,FALSE,FALSE,pytorch,8,7.59,12.75 +NVIDIA A100-PCIE-40GB,single,FALSE,FALSE,pytorch,16,-1,-1 +NVIDIA A100-PCIE-40GB,half,FALSE,FALSE,pytorch,16,14.54,21.24 +NVIDIA A100-PCIE-40GB,single,FALSE,FALSE,pytorch,32,-1,-1 +NVIDIA A100-PCIE-40GB,half,FALSE,FALSE,pytorch,32,-1,-1 +NVIDIA A100-PCIE-40GB,single,FALSE,FALSE,pytorch,64,-1,-1 +NVIDIA A100-PCIE-40GB,half,FALSE,FALSE,pytorch,64,-1,-1 +NVIDIA A100-PCIE-40GB,single,FALSE,FALSE,pytorch,128,-1,-1 +NVIDIA A100-PCIE-40GB,half,FALSE,FALSE,pytorch,128,-1,-1 +NVIDIA A100 80GB PCIe,single,False,False,pytorch,1,2.05,7.76 +NVIDIA A100 80GB PCIe,half,False,False,pytorch,1,1.53,3.41 +NVIDIA A100 80GB PCIe,single,False,False,pytorch,2,3.09,9.04 +NVIDIA A100 80GB PCIe,half,False,False,pytorch,2,3.06,5.53 +NVIDIA A100 80GB PCIe,single,False,False,pytorch,4,6.34,18.04 +NVIDIA A100 80GB PCIe,half,False,False,pytorch,4,4.57,6.74 +NVIDIA A100 80GB PCIe,single,False,False,pytorch,8,11.16,23.85 +NVIDIA A100 80GB PCIe,half,False,False,pytorch,8,7.91,12.6 +NVIDIA A100 80GB PCIe,single,False,False,pytorch,16,22.59,42.63 +NVIDIA A100 80GB PCIe,half,False,False,pytorch,16,14.22,20.58 +NVIDIA A100 80GB PCIe,single,False,False,pytorch,32,44.02,79.6 +NVIDIA A100 80GB PCIe,half,False,False,pytorch,32,27.73,45.19 +NVIDIA A100 80GB PCIe,single,False,False,pytorch,64,-1.0,-1.0 +NVIDIA A100 80GB PCIe,half,False,False,pytorch,64,55.55,79.54 +NVIDIA A100 80GB PCIe,single,False,False,pytorch,128,-1.0,-1.0 +NVIDIA A100 80GB PCIe,half,False,False,pytorch,128,-1.0,-1.0 +NVIDIA RTX A6000,single,FALSE,FALSE,pytorch,1,4.15,6.76 +NVIDIA RTX A6000,half,FALSE,FALSE,pytorch,1,2.43,3.42 +NVIDIA RTX A6000,single,FALSE,FALSE,pytorch,2,6,11.1 +NVIDIA RTX A6000,half,FALSE,FALSE,pytorch,2,3.88,4.5 +NVIDIA RTX A6000,single,FALSE,FALSE,pytorch,4,12.85,13.97 +NVIDIA RTX A6000,half,FALSE,FALSE,pytorch,4,7.77,8.88 +NVIDIA RTX A6000,single,FALSE,FALSE,pytorch,8,32.69,23.88 +NVIDIA 
RTX A6000,half,FALSE,FALSE,pytorch,8,21.21,12.74 +NVIDIA RTX A6000,single,FALSE,FALSE,pytorch,16,81.14,42.77 +NVIDIA RTX A6000,half,FALSE,FALSE,pytorch,16,48.49,21.23 +NVIDIA RTX A6000,single,FALSE,FALSE,pytorch,32,-1,-1 +NVIDIA RTX A6000,half,FALSE,FALSE,pytorch,32,-1,-1 +NVIDIA RTX A6000,single,FALSE,FALSE,pytorch,64,-1,-1 +NVIDIA RTX A6000,half,FALSE,FALSE,pytorch,64,-1,-1 +NVIDIA RTX A6000,single,FALSE,FALSE,pytorch,128,-1,-1 +NVIDIA RTX A6000,half,FALSE,FALSE,pytorch,128,-1,-1 +NVIDIA H100 PCIe,single,FALSE,FALSE,pytorch,1,1.73,7.7 +NVIDIA H100 PCIe,half,FALSE,FALSE,pytorch,1,1.06,3.46 +NVIDIA H100 PCIe,single,FALSE,FALSE,pytorch,2,2.66,9.79 +NVIDIA H100 PCIe,half,FALSE,FALSE,pytorch,2,1.73,4.57 +NVIDIA H100 PCIe,single,FALSE,FALSE,pytorch,4,4.47,18.49 +NVIDIA H100 PCIe,half,FALSE,FALSE,pytorch,4,2.63,8.91 +NVIDIA H100 PCIe,single,FALSE,FALSE,pytorch,8,8.16,23.86 +NVIDIA H100 PCIe,half,FALSE,FALSE,pytorch,8,4.97,12.57 +NVIDIA H100 PCIe,single,FALSE,FALSE,pytorch,16,15.98,42.38 +NVIDIA H100 PCIe,half,FALSE,FALSE,pytorch,16,9.61,29.01 +NVIDIA H100 PCIe,single,FALSE,FALSE,pytorch,32,32.04,80.51 +NVIDIA H100 PCIe,half,FALSE,FALSE,pytorch,32,19.07,55.57 +NVIDIA H100 PCIe,single,FALSE,FALSE,pytorch,64,-1,-1 +NVIDIA H100 PCIe,half,FALSE,FALSE,pytorch,64,-1,-1 +NVIDIA H100 PCIe,single,FALSE,FALSE,pytorch,128,-1,-1 +NVIDIA H100 PCIe,half,FALSE,FALSE,pytorch,128,-1,-1 diff --git a/docs/benchmark-update.md b/docs/benchmark-update.md new file mode 100644 index 0000000..b383e01 --- /dev/null +++ b/docs/benchmark-update.md @@ -0,0 +1,23 @@ +# Benchmark update + +We are currently running benchmarks to update our Stable Diffusion numbers using a more recent version of Diffusers and to take advantage of xformers. THe interim results on a limited set of GPUs are presented here. + +## Running the benchmark + +Ensure that [NVIDIA container toolkit](https://docs.nvidia.com/datacenter/cloud-native/container-toolkit/install-guide.html) is installed on your system and then run the following: + +```bash +git clone https://github.com/LambdaLabsML/lambda-diffusers.git +cd lambda-diffusers/scripts +make bench +``` + +Results will be written to `results.csv`, the benchmark will take different amounts of time depending on the GPU present but expect it to take at least several minutes. + +## Results + +The current results for the benchmark are available in [`benchmark.csv`](../benchmarks/benchmark.csv). These results were run with Diffusers 0.11.0 and xformers using Ubuntu 20.04, Python 3.8, PyTorch 1.13, CUDA 11.8 ([NGC PyTorch container 22.11](https://docs.nvidia.com/deeplearning/frameworks/pytorch-release-notes/rel-22-11.html)). + +xformers provides a significant boost in performance and memory consumption allowing large batch sizes to maximise utilisation of GPUs. Our best performance comes using NVIDIA A100-SXM4-40GB on [Lambda GPU cloud](https://cloud.lambdalabs.com), at the maximum batch size tested (128) at half precision we observe a throughput of 1.85 images/second when using DDIM 30 steps for sampling. + +![](./pictures/sd_throughput.png) \ No newline at end of file diff --git a/docs/benchmark.md b/docs/benchmark.md new file mode 100644 index 0000000..f16ea47 --- /dev/null +++ b/docs/benchmark.md @@ -0,0 +1,184 @@ +# Benchmarking Diffuser Models + +__We are currently in the process of updating our Stable Diffusion benchmark using more recent version of Diffusers and taking advantage of xformers. 
See the summary of the interim results [here](./benchmark-update.md)__
+
+We present a benchmark of [Stable Diffusion](https://huggingface.co/CompVis/stable-diffusion) model inference. This text2image model uses a text prompt as input and outputs an image of resolution `512x512`.
+
+Our experiments analyze inference performance in terms of speed, memory consumption, throughput, and quality of the output images. We look at how different choices in hardware (GPU model, GPU vs CPU) and software (single vs half precision, pytorch vs onnxruntime) affect inference performance.
+
+For reference, we provide benchmark results for the following GPU devices: A100 80GB PCIe, RTX3090, RTXA5500, RTXA6000, RTX3080, RTX8000. Please refer to the ["Reproducing the experiments"](#reproducing-the-experiments) section for details on running these experiments in your own environment.
+
+## Inference speed
+
+The figure below shows the inference latency for generating a single image with different hardware and precision, using the (arbitrary) text prompt: *"a photo of an astronaut riding a horse on mars"*.
+
+![Stable Diffusion Text2Image Latency (seconds)](./pictures/pretty_benchmark_sd_txt2img_latency.png)
+
+We find that:
+* The inference latencies range from `3.74` to `5.56` seconds across our tested Ampere GPUs, from the consumer 3080 card to the flagship A100 80GB card.
+* Half-precision reduces the latency by about `40%` for Ampere GPUs, and by `52%` for the previous generation `RTX8000` GPU.
+
+We believe Ampere GPUs enjoy a relatively "smaller" speedup from half-precision due to their use of `TF32`. For readers who are not familiar with `TF32`, it is a [`19-bit` format](https://blogs.nvidia.com/blog/2020/05/14/tensorfloat-32-precision-format/) that has been used as the default single-precision data type on Ampere GPUs for major deep learning frameworks such as PyTorch and TensorFlow. One can expect half-precision's speedup over `FP32` to be bigger, since `FP32` is a true `32-bit` format.
+
+We run these same inference jobs on CPU devices to put the GPU inference speeds in perspective.
+
+![Stable Diffusion Text2Image GPU vs CPU](./pictures/pretty_benchmark_sd_txt2img_gpu_vs_cpu.png)
+
+We note that:
+* GPUs are significantly faster -- by one or two orders of magnitude depending on the precision.
+* `onnxruntime` can reduce the latency for CPU by about `40%` to `50%`, depending on the type of CPU.
+
+ONNX currently does not have [stable support](https://github.com/huggingface/diffusers/issues/489) for Huggingface diffusers.
+We will investigate `onnxruntime-gpu` in future benchmarks.
+
+## Memory
+
+We also measure the memory consumption of running stable diffusion inference.
+
+![Stable Diffusion Text2Image Memory (GB)](./pictures/pretty_benchmark_sd_txt2img_mem.png)
+
+Memory usage is observed to be consistent across all tested GPUs:
+* It takes about `7.7 GB` of GPU memory to run single-precision inference with batch size one.
+* It takes about `4.5 GB` of GPU memory to run half-precision inference with batch size one.
+
+## Throughput
+
+Latency measures how quickly a _single_ input can be processed, which is critical to online applications that don't tolerate even the slightest delay. However, some (offline) applications may focus on "throughput", which measures the total volume of data processed in a fixed amount of time.
+
+Our throughput benchmark pushes the batch size to the maximum for each GPU, and measures the number of images they can process per minute.
The reason for maximizing the batch size is to keep tensor cores busy so that computation can dominate the workload, avoiding any non-computational bottlenecks.
+
+We run a series of throughput experiments in pytorch with half-precision, using the maximum batch size that can be used for each GPU:
+
+![Stable Diffusion Text2Image Throughput (images/minute)](./pictures/pretty_benchmark_sd_txt2img_throughput.png)
+
+We note:
+* Once again, A100 80GB is the top performer and has the highest throughput.
+* The gap between A100 80GB and other cards in terms of throughput can be explained by the larger maximum batch size that can be used on this card.
+
+As a concrete example, the chart below shows how A100 80GB's throughput increases by `64%` when we changed the batch size from 1 to 28 (the largest without causing an out of memory error). It is also interesting to see that the increase is not linear and flattens out when the batch size reaches a certain value, at which point the tensor cores on the GPU are saturated and any new data in GPU memory has to be queued up before getting its own computing resources.
+
+![Stable Diffusion Text2Image Batch size vs Throughput (images/minute)](./pictures/pretty_benchmark_sd_txt2img_batchsize_vs_throughput.png)
+
+## Precision
+
+We are curious about whether half-precision introduces degradation in the quality of the output images. To test this out, we fixed the text prompt as well as the "latent" input vector and fed them to the single-precision model and the half-precision model. We ran the inference for 100 steps and saved both models' outputs at each step, as well as the difference map:
+
+![Evolution of precision v degradation across 100 steps](./pictures/benchmark_sd_precision_history.gif)
+
+Our observation is that there are indeed visible differences between the single-precision output and the half-precision output, especially in the early steps. The differences often decrease with the number of steps, but might not always vanish.
+
+Interestingly, such a difference may not imply artifacts in half-precision's outputs. For example, at step 70, the picture below shows half-precision didn't produce the artifact in the single-precision output (an extra front leg):
+
+![Precision v Degradation at step 70](./pictures/benchmark_sd_precision_step_70.png)
+
+---
+
+## Reproducing the experiments
+
+You can use this [Lambda Diffusers](https://github.com/LambdaLabsML/lambda-diffusers) repository to reproduce the results presented in this article.
+
+### From your local machine
+
+#### Setup
+
+Before running the benchmark, make sure you have completed the repository [installation steps](../README.md#installation).
+
+You will then need to set the huggingface access token:
+1. Create a user account on HuggingFace and generate an access token.
+2. Set your huggingface access token as the `ACCESS_TOKEN` environment variable:
+```
+export ACCESS_TOKEN=
+```
+
+#### Usage
+
+Launch the `benchmark.py` script to append benchmark results to the existing [benchmark.csv](../benchmark.csv) results file:
+```
+python ./scripts/benchmark.py
+```
+
+Launch the `benchmark_quality.py` script to compare the output of the single-precision and half-precision models:
+```
+python ./scripts/benchmark_quality.py
+```
+
+### From a docker container
+
+The following instructions show how to run the benchmarking program from a docker container on Ubuntu.
+
+#### Prerequisites
+
+#### Get a huggingface access token
+
+Create a huggingface account.
+Get a [huggingface access token](https://huggingface.co/docs/hub/security-tokens).
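+
+For reference, the token is consumed at runtime: the `scripts/benchmark.py` script added in this change reads it from the `ACCESS_TOKEN` environment variable when it builds the ONNX pipeline. A slightly simplified excerpt of that call is shown below (the full version also selects the execution provider and dtype):
+
+```
+import os
+from diffusers import StableDiffusionOnnxPipeline
+
+# The ONNX backend authenticates to the Hugging Face Hub with the token
+# exposed through the ACCESS_TOKEN environment variable.
+pipe = StableDiffusionOnnxPipeline.from_pretrained(
+    "CompVis/stable-diffusion-v1-4",
+    use_auth_token=os.environ["ACCESS_TOKEN"],
+    revision="onnx",
+    provider="CPUExecutionProvider",
+)
+```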
+ +#### Install NVIDIA docker + +This section can be skipped if the environment already uses [Lambda Stack](https://lambdalabs.com/lambda-stack-deep-learning-software) or if the experiments are running on a [Lambda cloud](https://lambdalabs.com/service/gpu-cloud) instance as docker and `nvidia-container-toolkit` comes pre-installed in these cases. + +We first install docker: +``` +# Install +sudo apt-get update +sudo apt-get remove docker docker-engine docker.io -y +sudo apt install containerd -y +sudo apt install docker.io -y +sudo systemctl start docker +sudo systemctl enable docker +# Test install +docker --version +# Put the user in the docker group +sudo usermod -a -G docker $USER +newgrp docker +``` + +We install requirements to run docker containers leveraging NVIDIA GPUs: +``` +# Install +sudo apt install curl +distribution=$(. /etc/os-release;echo $ID$VERSION_ID) +curl -s -L https://nvidia.github.io/nvidia-docker/gpgkey | sudo apt-key add - +curl -s -L https://nvidia.github.io/nvidia-docker/$distribution/nvidia-docker.list | sudo tee /etc/apt/sources.list.d/nvidia-docker.list +sudo apt-get update && sudo apt-get install -y nvidia-container-toolkit +sudo systemctl restart docker +# Test install +sudo docker run --rm --gpus all nvidia/cuda:11.2.1-base-ubuntu20.04 nvidia-smi +``` + + +3. Build the benchmark docker image + +``` +docker build -t benchmark -f ./benchmarking/Dockerfile . +``` + +#### Running the benchmark + +Set the HuggingFace access token as environment variable: +``` +export ACCESS_TOKEN= +``` + +Run the benchmark program from the container and export the output `.csv` file to the host: +``` +containerid=$(docker run --gpus all -e ACCESS_TOKEN=${ACCESS_TOKEN} -d --entrypoint "python3" benchmark:latest scripts/benchmark.py --samples=1,2,4,8,16) && \ +docker wait ${containerid} && \ +docker cp ${containerid}:/lambda_diffusers/benchmark_tmp.csv ./benchmark_tmp.csv +``` + +*Note that the arguments `scripts/benchmark.py --samples=1,2,4,8,16` can be changed to point to a different script or use different arguments.* \ No newline at end of file diff --git a/docs/pictures/FreeMono.ttf b/docs/pictures/FreeMono.ttf new file mode 100644 index 0000000..7485f9e Binary files /dev/null and b/docs/pictures/FreeMono.ttf differ diff --git a/docs/pictures/benchmark_sd_precision_history.gif b/docs/pictures/benchmark_sd_precision_history.gif new file mode 100644 index 0000000..7cc1fde Binary files /dev/null and b/docs/pictures/benchmark_sd_precision_history.gif differ diff --git a/docs/pictures/benchmark_sd_precision_step_70.png b/docs/pictures/benchmark_sd_precision_step_70.png new file mode 100644 index 0000000..b7e5359 Binary files /dev/null and b/docs/pictures/benchmark_sd_precision_step_70.png differ diff --git a/docs/pictures/cost_analysis.png b/docs/pictures/cost_analysis.png new file mode 100644 index 0000000..2b5a473 Binary files /dev/null and b/docs/pictures/cost_analysis.png differ diff --git a/docs/pictures/pretty_benchmark_sd_txt2img_batchsize_vs_throughput.png b/docs/pictures/pretty_benchmark_sd_txt2img_batchsize_vs_throughput.png new file mode 100644 index 0000000..82f2b99 Binary files /dev/null and b/docs/pictures/pretty_benchmark_sd_txt2img_batchsize_vs_throughput.png differ diff --git a/docs/pictures/pretty_benchmark_sd_txt2img_gpu_vs_cpu.png b/docs/pictures/pretty_benchmark_sd_txt2img_gpu_vs_cpu.png new file mode 100644 index 0000000..9e633de Binary files /dev/null and b/docs/pictures/pretty_benchmark_sd_txt2img_gpu_vs_cpu.png differ diff --git 
a/docs/pictures/pretty_benchmark_sd_txt2img_latency.png b/docs/pictures/pretty_benchmark_sd_txt2img_latency.png new file mode 100644 index 0000000..124bb40 Binary files /dev/null and b/docs/pictures/pretty_benchmark_sd_txt2img_latency.png differ diff --git a/docs/pictures/pretty_benchmark_sd_txt2img_mem.png b/docs/pictures/pretty_benchmark_sd_txt2img_mem.png new file mode 100644 index 0000000..ea6a2f8 Binary files /dev/null and b/docs/pictures/pretty_benchmark_sd_txt2img_mem.png differ diff --git a/docs/pictures/pretty_benchmark_sd_txt2img_throughput.png b/docs/pictures/pretty_benchmark_sd_txt2img_throughput.png new file mode 100644 index 0000000..978c06b Binary files /dev/null and b/docs/pictures/pretty_benchmark_sd_txt2img_throughput.png differ diff --git a/docs/pictures/sd_throughput.png b/docs/pictures/sd_throughput.png new file mode 100644 index 0000000..06e0b15 Binary files /dev/null and b/docs/pictures/sd_throughput.png differ diff --git a/docs/pictures/sd_throughput_mig.png b/docs/pictures/sd_throughput_mig.png new file mode 100644 index 0000000..5e813a1 Binary files /dev/null and b/docs/pictures/sd_throughput_mig.png differ diff --git a/docs/pictures/sd_throughput_noxformer.png b/docs/pictures/sd_throughput_noxformer.png new file mode 100644 index 0000000..baae962 Binary files /dev/null and b/docs/pictures/sd_throughput_noxformer.png differ diff --git a/lambda_diffusers/pipelines/pipeline_stable_diffusion_im_embed.py b/lambda_diffusers/pipelines/pipeline_stable_diffusion_im_embed.py index 7b05876..486765f 100644 --- a/lambda_diffusers/pipelines/pipeline_stable_diffusion_im_embed.py +++ b/lambda_diffusers/pipelines/pipeline_stable_diffusion_im_embed.py @@ -25,7 +25,6 @@ def __init__( feature_extractor: CLIPFeatureExtractor, ): super().__init__() - scheduler = scheduler.set_format("pt") self.register_modules( vae=vae, image_encoder=image_encoder, diff --git a/requirements.txt b/requirements.txt index b41456b..9089b38 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,7 +1,9 @@ -torch -torchvision -transformers -ftfy -Pillow -diffusers --e . \ No newline at end of file +--extra-index-url https://download.pytorch.org/whl/cu116 torch + +transformers==4.22.1 +ftfy==6.1.1 +Pillow==9.2.0 +diffusers==0.3.0 +onnxruntime==1.12.1 +scikit-image==0.19.3 +-e . diff --git a/scripts/Dockerfile b/scripts/Dockerfile new file mode 100644 index 0000000..beb9793 --- /dev/null +++ b/scripts/Dockerfile @@ -0,0 +1,13 @@ +FROM nvcr.io/nvidia/pytorch:22.11-py3 + +ENV PYTHONDONTWRITEBYTECODE 1 +ENV PYTHONUNBUFFERED 1 + +RUN pip install --pre xformers +RUN pip install diffusers==0.11.0 accelerate transformers + +WORKDIR /workspace + +COPY benchmark.py /workspace/benchmark.py +RUN (printf '#!/bin/bash\npython benchmark.py \"$@\"\n' >> /entry.sh) && chmod a+x /entry.sh +ENTRYPOINT ["/entry.sh"] \ No newline at end of file diff --git a/scripts/Makefile b/scripts/Makefile new file mode 100644 index 0000000..9b722fb --- /dev/null +++ b/scripts/Makefile @@ -0,0 +1,17 @@ +bench: + docker build -t sd-bench . 
+ docker run \ + --rm -it \ + --gpus all \ + --shm-size=128g \ + --net=host \ + -v $(PWD):/workspace/results \ + sd-bench \ + --steps 30 \ + --samples 1,2,4,8,16,32,64,128 \ + --autocast no \ + --xformers yes \ + --output_file /workspace/results/results.csv + +clean: + rm results.csv \ No newline at end of file diff --git a/scripts/benchmark.py b/scripts/benchmark.py new file mode 100644 index 0000000..eef4818 --- /dev/null +++ b/scripts/benchmark.py @@ -0,0 +1,294 @@ +import os +import subprocess +import multiprocessing +import argparse +import pathlib +import csv +from contextlib import nullcontext +import itertools +import torch +from torch import autocast +from diffusers import StableDiffusionPipeline, StableDiffusionOnnxPipeline + +device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu") + +prompt = "a photo of an astronaut riding a horse on mars" + +def make_bool(yes_or_no): + if yes_or_no.lower() == "yes": + return True + elif yes_or_no.lower() == "no": + return False + else: + raise ValueError(f"unrecognised input {yes_or_no}") + +def get_inference_pipeline(precision, backend): + """ + returns HuggingFace diffuser pipeline + cf https://github.com/huggingface/diffusers#text-to-image-generation-with-stable-diffusion + """ + + assert precision in ("half", "single"), "precision in ['half', 'single']" + assert backend in ("pytorch", "onnx"), "backend in ['pytorch', 'onnx']" + + if backend == "pytorch": + pipe = StableDiffusionPipeline.from_pretrained( + "CompVis/stable-diffusion-v1-4", + revision="main" if precision == "single" else "fp16", + torch_dtype=torch.float32 if precision == "single" else torch.float16, + ) + pipe = pipe.to(device) + else: + pipe = StableDiffusionOnnxPipeline.from_pretrained( + "CompVis/stable-diffusion-v1-4", + use_auth_token=os.environ["ACCESS_TOKEN"], + revision="onnx", + provider="CPUExecutionProvider" + if device.type == "cpu" + else "CUDAExecutionProvider", + torch_dtype=torch.float32 if precision == "single" else torch.float16, + ) + + # Disable safety + disable_safety = True + if disable_safety: + + def null_safety(images, **kwargs): + return images, False + + pipe.safety_checker = null_safety + return pipe + + +def do_inference(pipe, n_samples, use_autocast, num_inference_steps): + torch.cuda.empty_cache() + context = ( + autocast if (device.type == "cuda" and use_autocast) else nullcontext + ) + with context("cuda"): + images = pipe( + prompt=[prompt] * n_samples, num_inference_steps=num_inference_steps + ).images + + return images + + +def get_inference_time( + pipe, n_samples, n_repeats, use_autocast, num_inference_steps +): + from torch.utils.benchmark import Timer + + timer = Timer( + stmt="do_inference(pipe, n_samples, use_autocast, num_inference_steps)", + setup="from __main__ import do_inference", + globals={ + "pipe": pipe, + "n_samples": n_samples, + "use_autocast": use_autocast, + "num_inference_steps": num_inference_steps, + }, + num_threads=multiprocessing.cpu_count(), + ) + profile_result = timer.timeit( + n_repeats + ) # benchmark.Timer performs 2 iterations for warmup + return round(profile_result.mean, 2) + + +def get_inference_memory(pipe, n_samples, use_autocast, num_inference_steps): + if not torch.cuda.is_available(): + return 0 + + torch.cuda.empty_cache() + context = autocast if (device.type == "cuda" and use_autocast) else nullcontext + with context("cuda"): + images = pipe( + prompt=[prompt] * n_samples, num_inference_steps=num_inference_steps + ).images + + mem = torch.cuda.memory_reserved() + return round(mem / 
1e9, 2) + +@torch.inference_mode() +def run_benchmark( + n_repeats, n_samples, precision, use_autocast, xformers, backend, num_inference_steps +): + """ + * n_repeats: nb datapoints for inference latency benchmark + * n_samples: number of samples to generate (~ batch size) + * precision: 'half' or 'single' (use fp16 or fp32 tensors) + + returns: + dict like {'memory usage': 17.70, 'latency': 86.71'} + """ + + print(f"n_samples: {n_samples}\tprecision: {precision}\tautocast: {use_autocast}\txformers: {xformers}\tbackend: {backend}") + + pipe = get_inference_pipeline(precision, backend) + if xformers: + pipe.enable_xformers_memory_efficient_attention() + + if n_samples>16: + pipe.enable_vae_slicing() + + logs = { + "memory": 0.00 + if device.type == "cpu" + else get_inference_memory( + pipe, n_samples, use_autocast, num_inference_steps + ), + "latency": get_inference_time( + pipe, n_samples, n_repeats, use_autocast, num_inference_steps + ), + } + print(logs, "\n") + print("============================") + return logs + + +def get_device_description(): + """ + returns descriptor of cuda device such as + 'NVIDIA RTX A6000' + """ + if device.type == "cpu": + name = subprocess.check_output( + "grep -m 1 'model name' /proc/cpuinfo", shell=True + ).decode("utf-8") + name = " ".join(name.split(" ")[2:]).strip() + return name + else: + return torch.cuda.get_device_name() + + +def run_benchmark_grid(grid, n_repeats, num_inference_steps, csv_fpath): + """ + * grid : dict like + { + "n_samples": (1, 2), + "precision": ("single", "half"), + "autocast" : ("yes", "no") + } + * n_repeats: nb datapoints for inference latency benchmark + """ + + # create benchmark.csv if not exists + if not os.path.isfile(csv_fpath): + header = [ + "device", + "precision", + "autocast", + "xformers" + "runtime", + "n_samples", + "latency", + "memory", + ] + with open(csv_fpath, "w") as f: + writer = csv.writer(f) + writer.writerow(header) + + # append new benchmark results to it if benchmark_tmp.csv already exists + with open(csv_fpath, "a") as f: + writer = csv.writer(f) + device_desc = get_device_description() + for trial in itertools.product(*grid.values()): + + n_samples, precision, use_autocast, xformers, backend = trial + use_autocast = make_bool(use_autocast) + xformers = make_bool(xformers) + + try: + new_log = run_benchmark( + n_repeats=n_repeats, + n_samples=n_samples, + precision=precision, + use_autocast=use_autocast, + xformers=xformers, + backend=backend, + num_inference_steps=num_inference_steps, + ) + except Exception as e: + if "CUDA out of memory" in str( + e + ) or "Failed to allocate memory" in str(e): + print(str(e)) + torch.cuda.empty_cache() + new_log = {"latency": -1.00, "memory": -1.00} + else: + raise e + + latency = new_log["latency"] + memory = new_log["memory"] + new_row = [ + device_desc, + precision, + use_autocast, + xformers, + backend, + n_samples, + latency, + memory, + ] + writer.writerow(new_row) + + +if __name__ == "__main__": + + parser = argparse.ArgumentParser() + + parser.add_argument( + "--samples", + default="1", + type=str, + help="Comma sepearated list of batch sizes (number of samples)", + ) + + parser.add_argument( + "--steps", default=50, type=int, help="Number of diffusion steps." 
+ ) + + parser.add_argument( + "--repeats", + default=3, + type=int, + help="Number of repeats.", + ) + + parser.add_argument( + "--autocast", + default="no", + type=str, + help="If 'yes', will perform additional runs with autocast activated for half precision inferences", + ) + + parser.add_argument( + "--xformers", + default="yes", + type=str, + help="If 'yes', will use xformers flash attention", + ) + + parser.add_argument( + "--output_file", + default="results.py", + type=str, + help="Path to output csv file to write", + ) + + args = parser.parse_args() + + grid = { + "n_samples": tuple(map(int, args.samples.split(","))), + # Only use single-precision for cpu because "LayerNormKernelImpl" not implemented for 'Half' on cpu, + # Remove autocast won't help. Ref: + # https://github.com/CompVis/stable-diffusion/issues/307 + "precision": ("single",) if device.type == "cpu" else ("single", "half"), + "autocast": args.autocast.split(","), + "xformers": args.xformers.split(","), + # Only use onnx for cpu, until issues are fixed by upstreams. Ref: + # https://github.com/huggingface/diffusers/issues/489#issuecomment-1261577250 + # https://github.com/huggingface/diffusers/pull/440 + "backend": ("pytorch", "onnx") if device.type == "cpu" else ("pytorch",), + } + run_benchmark_grid(grid, n_repeats=args.repeats, num_inference_steps=args.steps, csv_fpath=args.output_file) diff --git a/scripts/benchmark_quality.py b/scripts/benchmark_quality.py new file mode 100644 index 0000000..03151d4 --- /dev/null +++ b/scripts/benchmark_quality.py @@ -0,0 +1,119 @@ +import os +from platform import mac_ver +import numpy as np +from PIL import Image, ImageDraw, ImageFont +from skimage.metrics import structural_similarity as ssim +from skimage.metrics import normalized_root_mse as nmse +from skimage.metrics import peak_signal_noise_ratio as psnr + +import torch +from torch import autocast +from diffusers import StableDiffusionPipeline + +model_id = "CompVis/stable-diffusion-v1-4" +device = "cuda" +prompt = "a photo of an astronaut riding a horse on mars" +output_folder = "_".join(prompt.split(" ")) +os.makedirs(output_folder, exist_ok=True) + +num_images = 1 +width = 512 +height = 512 +min_inference_steps = 10 +max_inference_steps = 100 + +list_ssim = [] +list_nmse = [] +list_psnr = [] + +# Create piplines for single and half-precision +pipe = StableDiffusionPipeline.from_pretrained( + model_id, + use_auth_token=True, + torch_dtype=torch.float32) +pipe = pipe.to(device) + +pipe_half = StableDiffusionPipeline.from_pretrained( + model_id, + revision="fp16", + torch_dtype=torch.float16, + use_auth_token=True) +pipe_half = pipe_half.to(device) + +# Generate latent vectors +generator = torch.Generator(device=device) +latents = None +seeds = [] +for _ in range(num_images): + # Get a new random seed, store it and use it as the generator state + seed = generator.seed() + seeds.append(seed) + generator = generator.manual_seed(seed) + + image_latents = torch.randn( + (1, pipe.unet.in_channels, height // 8, width // 8), + generator = generator, + device = device + ) + latents = image_latents if latents is None else torch.cat((latents, image_latents)) + +for num_inference_steps in range(min_inference_steps, max_inference_steps, 5): + # Inference with single and half-precision + + torch.cuda.empty_cache() + images = pipe( + [prompt] * num_images, + guidance_scale=7.5, + latents = latents, + num_inference_steps = num_inference_steps + )["sample"] + + torch.cuda.empty_cache() + with torch.autocast(device): + images_half = 
pipe_half( + [prompt] * num_images, + guidance_scale=7.5, + latents = latents, + num_inference_steps = num_inference_steps + )["sample"] + + m_ssim = [] + m_nmse = [] + m_psnr = [] + + for idx, (image, image_half) in enumerate(zip(images, images_half)): + # Need to convert to float because uint8 can't store negative value + np_image = np.float32(np.asarray(image)) / 255.0 + np_image_half = np.float32(np.asarray(image_half)) / 255.0 + np_image_diff = np.absolute(np.float32(np.asarray(image)) - np.float32(np.asarray(image_half))) + + # Compute quantitative metrics + m_ssim.append(ssim(np_image, np_image_half, channel_axis=2)) + m_nmse.append(nmse(np_image, np_image_half)) + m_psnr.append(psnr(np_image, np_image_half)) + im_diff = Image.fromarray(np.uint8(np_image_diff)) + + # Compose results in a single output image + dst = Image.new('RGB', (image.width + image_half.width + im_diff.width, image.height)) + dst.paste(image, (0, 0)) + dst.paste(image_half, (image.width, 0)) + dst.paste(im_diff, (image.width + image_half.width, 0)) + I1 = ImageDraw.Draw(dst) + font = ImageFont.truetype('../docs/pictures/FreeMono.ttf', 16) + I1.text((32, 32), "Single Precision", font=font, fill=(255, 255, 255)) + I1.text((image.width + 32, 32), "Half Precision", font=font, fill=(255, 255, 255)) + I1.text((image.width + image_half.width + 32, 32), "Step " + str(num_inference_steps), font=font, fill=(255, 255, 255)) + dst.save(output_folder + "/" + str(idx) + "_" + str(num_inference_steps) + ".png") + + list_ssim.append(sum(m_ssim) / len(m_ssim)) + list_nmse.append(sum(m_nmse) / len(m_nmse)) + list_psnr.append(sum(m_psnr) / len(m_psnr)) + +print("ssim: ") +print(list_ssim) + +print("nmse: ") +print(list_nmse) + +print("psnr: ") +print(list_psnr)