diff --git a/LICENSE b/LICENSE new file mode 100644 index 0000000..ea7b0ed --- /dev/null +++ b/LICENSE @@ -0,0 +1,21 @@ +MIT License + +Copyright (c) 2022 Lambda, Inc + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. diff --git a/README.md b/README.md index 0e2f381..7306ced 100644 --- a/README.md +++ b/README.md @@ -5,6 +5,11 @@ _Additional models and pipelines for 🤗 Diffusers created by [Lambda Labs](htt - [Stable Diffusion Image Variations](#stable-diffusion-image-variations) - [Pokemon text to image](#pokemon-text-to-image) + +

+🦄 Other exciting ML projects at Lambda: ML Times, Distributed Training Guide, Text2Video, GPU Benchmark.

+
 ## Installation
 
 ```bash
@@ -31,21 +36,33 @@ A fine-tuned version of Stable Diffusion conditioned on CLIP image embeddings to
 ### Usage
 
 ```python
-from pathlib import Path
-from lambda_diffusers import StableDiffusionImageEmbedPipeline
+from diffusers import StableDiffusionImageVariationPipeline
+from torchvision import transforms
 from PIL import Image
-import torch
-device = "cuda" if torch.cuda.is_available() else "cpu"
-pipe = StableDiffusionImageEmbedPipeline.from_pretrained("lambdalabs/sd-image-variations-diffusers")
-pipe = pipe.to(device)
-im = Image.open("your/input/image/here.jpg")
-num_samples = 4
-image = pipe(num_samples*[im], guidance_scale=3.0)
-image = image["sample"]
-base_path = Path("outputs/im2im")
-base_path.mkdir(exist_ok=True, parents=True)
-for idx, im in enumerate(image):
-    im.save(base_path/f"{idx:06}.jpg")
+
+device = "cuda:0"
+sd_pipe = StableDiffusionImageVariationPipeline.from_pretrained(
+    "lambdalabs/sd-image-variations-diffusers",
+    revision="v2.0",
+    )
+sd_pipe = sd_pipe.to(device)
+
+im = Image.open("path/to/image.jpg")
+tform = transforms.Compose([
+    transforms.ToTensor(),
+    transforms.Resize(
+        (224, 224),
+        interpolation=transforms.InterpolationMode.BICUBIC,
+        antialias=False,
+    ),
+    transforms.Normalize(
+        [0.48145466, 0.4578275, 0.40821073],
+        [0.26862954, 0.26130258, 0.27577711]),
+])
+# the pipeline expects a batched NCHW image tensor, so add a batch dimension
+inp = tform(im).to(device).unsqueeze(0)
+
+out = sd_pipe(inp, guidance_scale=3)
+out["images"][0].save("result.jpg")
+
 ```
 
 ## Pokemon text to image
@@ -76,7 +93,7 @@ import torch
 from diffusers import StableDiffusionPipeline
 from torch import autocast
 
-pipe = StableDiffusionPipeline.from_pretrained("lambdalabs/sd-pokemon-diffusers", torch_dtype=torch.float16)  
+pipe = StableDiffusionPipeline.from_pretrained("lambdalabs/sd-pokemon-diffusers", torch_dtype=torch.float16)
 pipe = pipe.to("cuda")
 
 prompt = "Yoda"
@@ -101,29 +118,33 @@ for idx, im in enumerate(images):
 
 ## Benchmarking inference
 
-Detailed benchmark documentation can be found [here](./docs/benchmark.md).
+We have updated the original benchmark using xformers and a newer version of Diffusers; see the [new results here](./docs/benchmark-update.md) (the original results can still be found [here](./docs/benchmark.md)).
 
-### Setup
+### Usage
 
-Before running the benchmark, make sure you have completed the repository [installation steps](#installation).
+Ensure that the [NVIDIA container toolkit](https://docs.nvidia.com/datacenter/cloud-native/container-toolkit/install-guide.html) is installed on your system and then run the following:
 
-You will then need to set the huggingface access token:
-1. Create a user account on HuggingFace and generate an access token.
-2. Set your huggingface access token as the `ACCESS_TOKEN` environment variable:
-```
-export ACCESS_TOKEN=
+```bash
+git clone https://github.com/LambdaLabsML/lambda-diffusers.git
+cd lambda-diffusers/scripts
+make bench
 ```
 
-### Usage
-
-Launch the benchmark script to append benchmark results to the existing [benchmark.csv](./benchmark.csv) results file:
-```
-python ./scripts/benchmark.py
-```
+Currently `xformers` does not support H100, so the "without xformers" results below were generated by running the benchmark with `--xformers no` (this can be set in `scripts/Makefile`).
 
 ### Results
 
-Stable Diffusion Text2Image Latency (seconds)
+With [xformers](https://github.com/facebookresearch/xformers), raw data can be found [here](./benchmarks/benchmark.csv).
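+
+A minimal sketch for inspecting that raw data yourself (assumes `pandas` is installed; `latency` is seconds per batch and `-1` marks out-of-memory runs):
+
+```python
+import pandas as pd
+
+df = pd.read_csv("benchmarks/benchmark.csv")
+df = df[df.latency > 0]  # drop out-of-memory runs
+# throughput in images/second for each run
+df["throughput"] = df.n_samples / df.latency
+print(df.groupby(["device", "precision"]).throughput.max())
+```
+
+The throughput plot below was produced from the same data.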
+![](./docs/pictures/sd_throughput.png) + +Without [xformers](https://github.com/facebookresearch/xformers), raw data can be found [here](./benchmarks/benchmark_no_xformers.csv). +![](./docs/pictures/sd_throughput_noxformer.png) + +H100 MIG performance, raw data can be found [here](./benchmarks/benchmark_H100_MIG.csv). +![](./docs/pictures/sd_throughput_mig.png) + +Cost analysis +![](./docs/pictures/cost_analysis.png) ## Links diff --git a/benchmark.csv b/benchmark.csv deleted file mode 100644 index 5a5e51d..0000000 --- a/benchmark.csv +++ /dev/null @@ -1,58 +0,0 @@ -Intel(R) Core(TM) i7-6850K CPU @ 3.60GHz,single,pytorch,1,458.97,0.0 -Intel(R) Core(TM) i7-6850K CPU @ 3.60GHz,single,onnx,1,286.13,0.0 -NVIDIA GeForce RTX 3090,single,pytorch,1,7.96,7.72 -NVIDIA GeForce RTX 3090,half,pytorch,1,4.83,4.54 -NVIDIA GeForce RTX 3090,single,pytorch,2,14.49,11 -NVIDIA GeForce RTX 3090,half,pytorch,2,8.42,8.75 -NVIDIA GeForce RTX 3090,single,pytorch,4,27.94,17.69 -NVIDIA GeForce RTX 3090,half,pytorch,4,15.87,15.36 -NVIDIA GeForce RTX 3090,single,pytorch,8,-1.0,-1.0 -NVIDIA GeForce RTX 3090,half,pytorch,8,-1.0,-1.0 -NVIDIA RTX A5500,single,pytorch,1,8.55,7.69 -NVIDIA RTX A5500,half,pytorch,1,5.05,4.58 -NVIDIA RTX A5500,single,pytorch,2,15.71,11 -NVIDIA RTX A5500,half,pytorch,2,9.37,8.8 -NVIDIA RTX A5500,single,pytorch,4,30.51,17.69 -NVIDIA RTX A5500,half,pytorch,4,16.97,15.33 -NVIDIA RTX A5500,single,pytorch,8,-1.0,-1.0 -NVIDIA RTX A5500,half,pytorch,8,-1.0,-1.0 -AMD EPYC 7352 24-Core Processor,single,pytorch,1,529.93,0.0 -AMD EPYC 7352 24-Core Processor,single,onnx,1,223.19,0.0 -NVIDIA GeForce RTX 3080,single,pytorch,4,-1.0,-1.0 -NVIDIA GeForce RTX 3080,half,pytorch,4,-1.0,-1.0 -NVIDIA GeForce RTX 3080,single,pytorch,1,-1.0,-1.0 -NVIDIA GeForce RTX 3080,half,pytorch,1,5.59,4.52 -NVIDIA GeForce RTX 3080,single,pytorch,2,-1.0,-1.0 -NVIDIA GeForce RTX 3080,half,pytorch,2,-1.0,-1.0 -NVIDIA A100 80GB PCIe,single,pytorch,1,6.39,7.75 -NVIDIA A100 80GB PCIe,half,pytorch,1,3.74,4.55 -NVIDIA A100 80GB PCIe,single,pytorch,2,11.12,11.05 -NVIDIA A100 80GB PCIe,half,pytorch,2,5.72,8.77 -NVIDIA A100 80GB PCIe,single,pytorch,4,20.18,17.63 -NVIDIA A100 80GB PCIe,half,pytorch,4,10.04,15.34 -NVIDIA A100 80GB PCIe,single,pytorch,8,38.88,30.88 -NVIDIA A100 80GB PCIe,half,pytorch,8,18.68,28.47 -NVIDIA A100 80GB PCIe,single,pytorch,16,76.92,57.46 -NVIDIA A100 80GB PCIe,half,pytorch,16,36.67,54.73 -NVIDIA A100 80GB PCIe,half,pytorch,28,63.88,78.78 -NVIDIA RTX A6000,single,pytorch,1,8.09,7.75 -NVIDIA RTX A6000,half,pytorch,1,5.03,4.53 -NVIDIA RTX A6000,single,pytorch,2,14.86,10.98 -NVIDIA RTX A6000,half,pytorch,2,9.03,8.79 -NVIDIA RTX A6000,single,pytorch,4,27.92,17.62 -NVIDIA RTX A6000,half,pytorch,4,17.0,15.34 -NVIDIA RTX A6000,single,pytorch,8,53.95,30.88 -NVIDIA RTX A6000,half,pytorch,8,32.57,28.51 -NVIDIA RTX A6000,half,pytorch,16,63.16,46.11 -Quadro RTX 8000,single,pytorch,1,12.3,7.71 -Quadro RTX 8000,half,pytorch,1,5.93,4.52 -Quadro RTX 8000,single,pytorch,2,24.42,9.16 -Quadro RTX 8000,half,pytorch,2,10.92,7.02 -Quadro RTX 8000,single,pytorch,4,42.56,15.58 -Quadro RTX 8000,half,pytorch,4,21.24,12.39 -Quadro RTX 8000,single,pytorch,8,76.96,23.11 -Quadro RTX 8000,half,pytorch,8,40.52,20.98 -Quadro RTX 8000,single,pytorch,16,152.55,42.47 -Quadro RTX 8000,half,pytorch,16,80.31,38.18 -Quadro RTX 8000,single,pytorch,32,-1.0,-1.0 -Quadro RTX 8000,half,pytorch,32,-1.0,-1.0 diff --git a/benchmarks/benchmark.csv b/benchmarks/benchmark.csv new file mode 100644 index 0000000..aaa55ea --- /dev/null +++ b/benchmarks/benchmark.csv 
@@ -0,0 +1,81 @@ +device,precision,autocast,xformers,runtime,n_samples,latency,memory +NVIDIA A10,half,FALSE,TRUE,pytorch,1,2.01,3.13 +NVIDIA A10,single,FALSE,TRUE,pytorch,1,4.69,6.29 +NVIDIA A10,half,FALSE,TRUE,pytorch,2,3.65,4.3 +NVIDIA A10,single,FALSE,TRUE,pytorch,2,7.75,8.57 +NVIDIA A10,half,FALSE,TRUE,pytorch,4,6.68,6.63 +NVIDIA A10,single,FALSE,TRUE,pytorch,4,14.35,11.24 +NVIDIA A10,half,FALSE,TRUE,pytorch,8,12.93,11.05 +NVIDIA A10,single,FALSE,TRUE,pytorch,8,28.28,17.91 +NVIDIA A10,half,FALSE,TRUE,pytorch,16,24.65,19.86 +NVIDIA A10,single,FALSE,TRUE,pytorch,16,57.5,21.21 +NVIDIA A10,half,FALSE,TRUE,pytorch,32,48.79,7.37 +NVIDIA A10,single,FALSE,TRUE,pytorch,32,108.78,15.88 +NVIDIA A10,half,FALSE,TRUE,pytorch,64,108.26,17.54 +NVIDIA A10,single,FALSE,TRUE,pytorch,64,-1,-1 +NVIDIA A10,half,FALSE,TRUE,pytorch,128,212.94,22.18 +NVIDIA A10,single,FALSE,TRUE,pytorch,128,-1,-1 +NVIDIA A100 80GB PCIe,single,FALSE,TRUE,pytorch,1,1.78,6.1 +NVIDIA A100 80GB PCIe,half,FALSE,TRUE,pytorch,1,1.17,3.19 +NVIDIA A100 80GB PCIe,single,FALSE,TRUE,pytorch,2,3.68,8.03 +NVIDIA A100 80GB PCIe,half,FALSE,TRUE,pytorch,2,1.73,4.33 +NVIDIA A100 80GB PCIe,single,FALSE,TRUE,pytorch,4,5.56,11.53 +NVIDIA A100 80GB PCIe,half,FALSE,TRUE,pytorch,4,3.73,6.62 +NVIDIA A100 80GB PCIe,single,FALSE,TRUE,pytorch,8,10.95,18.12 +NVIDIA A100 80GB PCIe,half,FALSE,TRUE,pytorch,8,5.25,11.12 +NVIDIA A100 80GB PCIe,single,FALSE,TRUE,pytorch,16,21.05,33.04 +NVIDIA A100 80GB PCIe,half,FALSE,TRUE,pytorch,16,9.93,19.81 +NVIDIA A100 80GB PCIe,single,FALSE,TRUE,pytorch,32,41.02,14.41 +NVIDIA A100 80GB PCIe,half,FALSE,TRUE,pytorch,32,18.75,7.34 +NVIDIA A100 80GB PCIe,single,FALSE,TRUE,pytorch,64,80.45,26.17 +NVIDIA A100 80GB PCIe,half,FALSE,TRUE,pytorch,64,36.89,12.46 +NVIDIA A100 80GB PCIe,single,FALSE,TRUE,pytorch,128,161.52,48.01 +NVIDIA A100 80GB PCIe,half,FALSE,TRUE,pytorch,128,73.72,22.68 +NVIDIA A100-SXM4-40GB,single,FALSE,TRUE,pytorch,1,1.79,6.11 +NVIDIA A100-SXM4-40GB,half,FALSE,TRUE,pytorch,1,1.18,3.18 +NVIDIA A100-SXM4-40GB,single,FALSE,TRUE,pytorch,2,2.97,8.03 +NVIDIA A100-SXM4-40GB,half,FALSE,TRUE,pytorch,2,1.66,4.32 +NVIDIA A100-SXM4-40GB,single,FALSE,TRUE,pytorch,4,5.35,11.54 +NVIDIA A100-SXM4-40GB,half,FALSE,TRUE,pytorch,4,2.68,6.61 +NVIDIA A100-SXM4-40GB,single,FALSE,TRUE,pytorch,8,10.16,18.11 +NVIDIA A100-SXM4-40GB,half,FALSE,TRUE,pytorch,8,4.85,11.12 +NVIDIA A100-SXM4-40GB,half,FALSE,TRUE,pytorch,16,9.13,19.8 +NVIDIA A100-SXM4-40GB,single,FALSE,TRUE,pytorch,16,19.71,33.25 +NVIDIA A100-SXM4-40GB,half,FALSE,TRUE,pytorch,32,17.72,7.33 +NVIDIA A100-SXM4-40GB,single,FALSE,TRUE,pytorch,32,39.03,14.39 +NVIDIA A100-SXM4-40GB,half,FALSE,TRUE,pytorch,64,34.92,13.79 +NVIDIA A100-SXM4-40GB,single,FALSE,TRUE,pytorch,64,77.05,26.34 +NVIDIA A100-SXM4-40GB,half,FALSE,TRUE,pytorch,128,69.31,22.68 +NVIDIA A100-SXM4-40GB,single,FALSE,TRUE,pytorch,128,-1,-1 +NVIDIA RTX A6000,single,FALSE,TRUE,pytorch,1,3.61,6.35 +NVIDIA RTX A6000,half,FALSE,TRUE,pytorch,1,1.93,3.15 +NVIDIA RTX A6000,single,FALSE,TRUE,pytorch,2,5.57,7.73 +NVIDIA RTX A6000,half,FALSE,TRUE,pytorch,2,2.84,4.37 +NVIDIA RTX A6000,single,FALSE,TRUE,pytorch,4,9.67,10.7 +NVIDIA RTX A6000,half,FALSE,TRUE,pytorch,4,4.56,6.64 +NVIDIA RTX A6000,single,FALSE,TRUE,pytorch,8,18.96,16.87 +NVIDIA RTX A6000,half,FALSE,TRUE,pytorch,8,8.39,11.19 +NVIDIA RTX A6000,single,FALSE,TRUE,pytorch,16,37.89,28.82 +NVIDIA RTX A6000,half,FALSE,TRUE,pytorch,16,15.62,20.01 +NVIDIA RTX A6000,single,FALSE,TRUE,pytorch,32,71.57,14.26 +NVIDIA RTX A6000,half,FALSE,TRUE,pytorch,32,31.19,7.65 +NVIDIA RTX 
A6000,single,FALSE,TRUE,pytorch,64,143.26,26.42 +NVIDIA RTX A6000,half,FALSE,TRUE,pytorch,64,65.72,23.84 +NVIDIA RTX A6000,single,FALSE,TRUE,pytorch,128,287.96,47.92 +NVIDIA RTX A6000,half,FALSE,TRUE,pytorch,128,130.38,34.36 +Tesla V100-SXM2-16GB,single,FALSE,TRUE,pytorch,1,4.42,5.7 +Tesla V100-SXM2-16GB,half,FALSE,TRUE,pytorch,1,1.84,3.24 +Tesla V100-SXM2-16GB,single,FALSE,TRUE,pytorch,2,8.33,8.6 +Tesla V100-SXM2-16GB,half,FALSE,TRUE,pytorch,2,3.08,4.17 +Tesla V100-SXM2-16GB,single,FALSE,TRUE,pytorch,4,16.56,11.86 +Tesla V100-SXM2-16GB,half,FALSE,TRUE,pytorch,4,5.62,6.42 +Tesla V100-SXM2-16GB,single,FALSE,TRUE,pytorch,8,28.71,15.88 +Tesla V100-SXM2-16GB,half,FALSE,TRUE,pytorch,8,10.64,10.45 +Tesla V100-SXM2-16GB,half,FALSE,TRUE,pytorch,16,20.96,10.87 +Tesla V100-SXM2-16GB,single,FALSE,TRUE,pytorch,16,-1,-1 +Tesla V100-SXM2-16GB,half,FALSE,TRUE,pytorch,32,40.13,7.73 +Tesla V100-SXM2-16GB,single,FALSE,TRUE,pytorch,32,110.17,15.72 +Tesla V100-SXM2-16GB,half,FALSE,TRUE,pytorch,64,79.82,13.51 +Tesla V100-SXM2-16GB,single,FALSE,TRUE,pytorch,64,-1,-1 +Tesla V100-SXM2-16GB,single,FALSE,TRUE,pytorch,128,-1,-1 +Tesla V100-SXM2-16GB,half,FALSE,TRUE,pytorch,128,-1,-1 diff --git a/benchmarks/benchmark_H100_MIG.csv b/benchmarks/benchmark_H100_MIG.csv new file mode 100644 index 0000000..87c70dd --- /dev/null +++ b/benchmarks/benchmark_H100_MIG.csv @@ -0,0 +1,65 @@ +device,precision,autocast,xformers,runtime,n_samples,latency,memory, +NVIDIA H100 PCIe,single,FALSE,FALSE,pytorch,1,1.73,7.7 +NVIDIA H100 PCIe,half,FALSE,FALSE,pytorch,1,1.06,3.46 +NVIDIA H100 PCIe,single,FALSE,FALSE,pytorch,2,2.66,9.79 +NVIDIA H100 PCIe,half,FALSE,FALSE,pytorch,2,1.73,4.57 +NVIDIA H100 PCIe,single,FALSE,FALSE,pytorch,4,4.47,18.49 +NVIDIA H100 PCIe,half,FALSE,FALSE,pytorch,4,2.63,8.91 +NVIDIA H100 PCIe,single,FALSE,FALSE,pytorch,8,8.16,23.86 +NVIDIA H100 PCIe,half,FALSE,FALSE,pytorch,8,4.97,12.57 +NVIDIA H100 PCIe,single,FALSE,FALSE,pytorch,16,15.98,42.38 +NVIDIA H100 PCIe,half,FALSE,FALSE,pytorch,16,9.61,29.01 +NVIDIA H100 PCIe,single,FALSE,FALSE,pytorch,32,32.04,80.51 +NVIDIA H100 PCIe,half,FALSE,FALSE,pytorch,32,19.07,55.57 +NVIDIA H100 PCIe,single,FALSE,FALSE,pytorch,64,-1,-1 +NVIDIA H100 PCIe,half,FALSE,FALSE,pytorch,64,-1,-1 +NVIDIA H100 PCIe,single,FALSE,FALSE,pytorch,128,-1,-1 +NVIDIA H100 PCIe,half,FALSE,FALSE,pytorch,128,-1,-1 +NVIDIA H100 PCIe MIG 4g.40gb,single,FALSE,FALSE,pytorch,1,2.3,7.74 +NVIDIA H100 PCIe MIG 4g.40gb,half,FALSE,FALSE,pytorch,1,1.52,3.45 +NVIDIA H100 PCIe MIG 4g.40gb,single,FALSE,FALSE,pytorch,2,3.95,9.48 +NVIDIA H100 PCIe MIG 4g.40gb,half,FALSE,FALSE,pytorch,2,2.42,4.57 +NVIDIA H100 PCIe MIG 4g.40gb,single,FALSE,FALSE,pytorch,4,7.12,18.2 +NVIDIA H100 PCIe MIG 4g.40gb,half,FALSE,FALSE,pytorch,4,4.17,8.9 +NVIDIA H100 PCIe MIG 4g.40gb,single,FALSE,FALSE,pytorch,8,13.91,23.75 +NVIDIA H100 PCIe MIG 4g.40gb,half,FALSE,FALSE,pytorch,8,7.91,12.49 +NVIDIA H100 PCIe MIG 4g.40gb,single,FALSE,FALSE,pytorch,16,-1,-1 +NVIDIA H100 PCIe MIG 4g.40gb,half,FALSE,FALSE,pytorch,16,15.73,29.01 +NVIDIA H100 PCIe MIG 4g.40gb,single,FALSE,FALSE,pytorch,32,-1,-1 +NVIDIA H100 PCIe MIG 4g.40gb,half,FALSE,FALSE,pytorch,32,-1,-1 +NVIDIA H100 PCIe MIG 4g.40gb,single,FALSE,FALSE,pytorch,64,-1,-1 +NVIDIA H100 PCIe MIG 4g.40gb,half,FALSE,FALSE,pytorch,64,-1,-1 +NVIDIA H100 PCIe MIG 4g.40gb,single,FALSE,FALSE,pytorch,128,-1,-1 +NVIDIA H100 PCIe MIG 4g.40gb,half,FALSE,FALSE,pytorch,128,-1,-1 +NVIDIA H100 PCIe MIG 2g.20gb,single,FALSE,FALSE,pytorch,1,4.2,7.76 +NVIDIA H100 PCIe MIG 2g.20gb,half,FALSE,FALSE,pytorch,1,2.58,3.41 
+NVIDIA H100 PCIe MIG 2g.20gb,single,FALSE,FALSE,pytorch,2,7.61,11.09 +NVIDIA H100 PCIe MIG 2g.20gb,half,FALSE,FALSE,pytorch,2,4.56,4.59 +NVIDIA H100 PCIe MIG 2g.20gb,single,FALSE,FALSE,pytorch,4,14.45,17.65 +NVIDIA H100 PCIe MIG 2g.20gb,half,FALSE,FALSE,pytorch,4,8.24,6.78 +NVIDIA H100 PCIe MIG 2g.20gb,single,FALSE,FALSE,pytorch,8,-1,-1 +NVIDIA H100 PCIe MIG 2g.20gb,half,FALSE,FALSE,pytorch,8,15.81,15.65 +NVIDIA H100 PCIe MIG 2g.20gb,single,FALSE,FALSE,pytorch,16,-1,-1 +NVIDIA H100 PCIe MIG 2g.20gb,half,FALSE,FALSE,pytorch,16,-1,-1 +NVIDIA H100 PCIe MIG 2g.20gb,single,FALSE,FALSE,pytorch,32,-1,-1 +NVIDIA H100 PCIe MIG 2g.20gb,half,FALSE,FALSE,pytorch,32,-1,-1 +NVIDIA H100 PCIe MIG 2g.20gb,single,FALSE,FALSE,pytorch,64,-1,-1 +NVIDIA H100 PCIe MIG 2g.20gb,half,FALSE,FALSE,pytorch,64,-1,-1 +NVIDIA H100 PCIe MIG 2g.20gb,single,FALSE,FALSE,pytorch,128,-1,-1 +NVIDIA H100 PCIe MIG 2g.20gb,half,FALSE,FALSE,pytorch,128,-1,-1 +NVIDIA H100 PCIe MIG 1g.10gb,single,FALSE,FALSE,pytorch,1,9.17,7.76 +NVIDIA H100 PCIe MIG 1g.10gb,half,FALSE,FALSE,pytorch,1,5.39,3.47 +NVIDIA H100 PCIe MIG 1g.10gb,single,FALSE,FALSE,pytorch,2,-1,-1 +NVIDIA H100 PCIe MIG 1g.10gb,half,FALSE,FALSE,pytorch,2,9.29,4.63 +NVIDIA H100 PCIe MIG 1g.10gb,single,FALSE,FALSE,pytorch,4,-1,-1 +NVIDIA H100 PCIe MIG 1g.10gb,half,FALSE,FALSE,pytorch,4,17.4,6.8 +NVIDIA H100 PCIe MIG 1g.10gb,single,FALSE,FALSE,pytorch,8,-1,-1 +NVIDIA H100 PCIe MIG 1g.10gb,half,FALSE,FALSE,pytorch,8,-1,-1 +NVIDIA H100 PCIe MIG 1g.10gb,single,FALSE,FALSE,pytorch,16,-1,-1 +NVIDIA H100 PCIe MIG 1g.10gb,half,FALSE,FALSE,pytorch,16,-1,-1 +NVIDIA H100 PCIe MIG 1g.10gb,single,FALSE,FALSE,pytorch,32,-1,-1 +NVIDIA H100 PCIe MIG 1g.10gb,half,FALSE,FALSE,pytorch,32,-1,-1 +NVIDIA H100 PCIe MIG 1g.10gb,single,FALSE,FALSE,pytorch,64,-1,-1 +NVIDIA H100 PCIe MIG 1g.10gb,half,FALSE,FALSE,pytorch,64,-1,-1 +NVIDIA H100 PCIe MIG 1g.10gb,single,FALSE,FALSE,pytorch,128,-1,-1 +NVIDIA H100 PCIe MIG 1g.10gb,half,FALSE,FALSE,pytorch,128,-1,-1 \ No newline at end of file diff --git a/benchmarks/benchmark_no_xformers.csv b/benchmarks/benchmark_no_xformers.csv new file mode 100644 index 0000000..d578b6d --- /dev/null +++ b/benchmarks/benchmark_no_xformers.csv @@ -0,0 +1,97 @@ +device,precision,autocast,xformers,runtime,n_samples,latency,memory, +NVIDIA A10,single,FALSE,FALSE,pytorch,1,4.75,6.73 +NVIDIA A10,half,FALSE,FALSE,pytorch,1,2.71,3.43 +NVIDIA A10,single,FALSE,FALSE,pytorch,2,8.75,9 +NVIDIA A10,half,FALSE,FALSE,pytorch,2,4.99,5.53 +NVIDIA A10,single,FALSE,FALSE,pytorch,4,17.18,18.14 +NVIDIA A10,half,FALSE,FALSE,pytorch,4,9.65,6.84 +NVIDIA A10,single,FALSE,FALSE,pytorch,8,-1,-1 +NVIDIA A10,half,FALSE,FALSE,pytorch,8,18.58,12.66 +NVIDIA A10,single,FALSE,FALSE,pytorch,16,-1,-1 +NVIDIA A10,half,FALSE,FALSE,pytorch,16,36.32,20.64 +NVIDIA A10,single,FALSE,FALSE,pytorch,32,-1,-1 +NVIDIA A10,half,FALSE,FALSE,pytorch,32,-1,-1 +NVIDIA A10,single,FALSE,FALSE,pytorch,64,-1,-1 +NVIDIA A10,half,FALSE,FALSE,pytorch,64,-1,-1 +NVIDIA A10,single,FALSE,FALSE,pytorch,128,-1,-1 +NVIDIA A10,half,FALSE,FALSE,pytorch,128,-1,-1 +NVIDIA A100-SXM4-40GB,single,FALSE,FALSE,pytorch,1,1.72,7.76 +NVIDIA A100-SXM4-40GB,half,FALSE,FALSE,pytorch,1,1.18,3.41 +NVIDIA A100-SXM4-40GB,single,FALSE,FALSE,pytorch,2,3.03,9.04 +NVIDIA A100-SXM4-40GB,half,FALSE,FALSE,pytorch,2,1.88,5.53 +NVIDIA A100-SXM4-40GB,single,FALSE,FALSE,pytorch,4,5.53,18.04 +NVIDIA A100-SXM4-40GB,half,FALSE,FALSE,pytorch,4,3.35,6.74 +NVIDIA A100-SXM4-40GB,single,FALSE,FALSE,pytorch,8,10.95,23.85 +NVIDIA 
A100-SXM4-40GB,half,FALSE,FALSE,pytorch,8,6.28,12.6 +NVIDIA A100-SXM4-40GB,single,FALSE,FALSE,pytorch,16,-1,-1 +NVIDIA A100-SXM4-40GB,half,FALSE,FALSE,pytorch,16,12.57,20.58 +NVIDIA A100-SXM4-40GB,single,FALSE,FALSE,pytorch,32,-1,-1 +NVIDIA A100-SXM4-40GB,half,FALSE,FALSE,pytorch,32,-1,-1 +NVIDIA A100-SXM4-40GB,single,FALSE,FALSE,pytorch,64,-1,-1 +NVIDIA A100-SXM4-40GB,half,FALSE,FALSE,pytorch,64,-1,-1 +NVIDIA A100-SXM4-40GB,single,FALSE,FALSE,pytorch,128,-1,-1 +NVIDIA A100-SXM4-40GB,half,FALSE,FALSE,pytorch,128,-1,-1 +NVIDIA A100-PCIE-40GB,single,FALSE,FALSE,pytorch,1,1.99,7.76 +NVIDIA A100-PCIE-40GB,half,FALSE,FALSE,pytorch,1,1.5,3.45 +NVIDIA A100-PCIE-40GB,single,FALSE,FALSE,pytorch,2,3.52,11.11 +NVIDIA A100-PCIE-40GB,half,FALSE,FALSE,pytorch,2,2.3,4.53 +NVIDIA A100-PCIE-40GB,single,FALSE,FALSE,pytorch,4,6.31,13.98 +NVIDIA A100-PCIE-40GB,half,FALSE,FALSE,pytorch,4,4.04,8.91 +NVIDIA A100-PCIE-40GB,single,FALSE,FALSE,pytorch,8,12.21,23.91 +NVIDIA A100-PCIE-40GB,half,FALSE,FALSE,pytorch,8,7.59,12.75 +NVIDIA A100-PCIE-40GB,single,FALSE,FALSE,pytorch,16,-1,-1 +NVIDIA A100-PCIE-40GB,half,FALSE,FALSE,pytorch,16,14.54,21.24 +NVIDIA A100-PCIE-40GB,single,FALSE,FALSE,pytorch,32,-1,-1 +NVIDIA A100-PCIE-40GB,half,FALSE,FALSE,pytorch,32,-1,-1 +NVIDIA A100-PCIE-40GB,single,FALSE,FALSE,pytorch,64,-1,-1 +NVIDIA A100-PCIE-40GB,half,FALSE,FALSE,pytorch,64,-1,-1 +NVIDIA A100-PCIE-40GB,single,FALSE,FALSE,pytorch,128,-1,-1 +NVIDIA A100-PCIE-40GB,half,FALSE,FALSE,pytorch,128,-1,-1 +NVIDIA A100 80GB PCIe,single,False,False,pytorch,1,2.05,7.76 +NVIDIA A100 80GB PCIe,half,False,False,pytorch,1,1.53,3.41 +NVIDIA A100 80GB PCIe,single,False,False,pytorch,2,3.09,9.04 +NVIDIA A100 80GB PCIe,half,False,False,pytorch,2,3.06,5.53 +NVIDIA A100 80GB PCIe,single,False,False,pytorch,4,6.34,18.04 +NVIDIA A100 80GB PCIe,half,False,False,pytorch,4,4.57,6.74 +NVIDIA A100 80GB PCIe,single,False,False,pytorch,8,11.16,23.85 +NVIDIA A100 80GB PCIe,half,False,False,pytorch,8,7.91,12.6 +NVIDIA A100 80GB PCIe,single,False,False,pytorch,16,22.59,42.63 +NVIDIA A100 80GB PCIe,half,False,False,pytorch,16,14.22,20.58 +NVIDIA A100 80GB PCIe,single,False,False,pytorch,32,44.02,79.6 +NVIDIA A100 80GB PCIe,half,False,False,pytorch,32,27.73,45.19 +NVIDIA A100 80GB PCIe,single,False,False,pytorch,64,-1.0,-1.0 +NVIDIA A100 80GB PCIe,half,False,False,pytorch,64,55.55,79.54 +NVIDIA A100 80GB PCIe,single,False,False,pytorch,128,-1.0,-1.0 +NVIDIA A100 80GB PCIe,half,False,False,pytorch,128,-1.0,-1.0 +NVIDIA RTX A6000,single,FALSE,FALSE,pytorch,1,4.15,6.76 +NVIDIA RTX A6000,half,FALSE,FALSE,pytorch,1,2.43,3.42 +NVIDIA RTX A6000,single,FALSE,FALSE,pytorch,2,6,11.1 +NVIDIA RTX A6000,half,FALSE,FALSE,pytorch,2,3.88,4.5 +NVIDIA RTX A6000,single,FALSE,FALSE,pytorch,4,12.85,13.97 +NVIDIA RTX A6000,half,FALSE,FALSE,pytorch,4,7.77,8.88 +NVIDIA RTX A6000,single,FALSE,FALSE,pytorch,8,32.69,23.88 +NVIDIA RTX A6000,half,FALSE,FALSE,pytorch,8,21.21,12.74 +NVIDIA RTX A6000,single,FALSE,FALSE,pytorch,16,81.14,42.77 +NVIDIA RTX A6000,half,FALSE,FALSE,pytorch,16,48.49,21.23 +NVIDIA RTX A6000,single,FALSE,FALSE,pytorch,32,-1,-1 +NVIDIA RTX A6000,half,FALSE,FALSE,pytorch,32,-1,-1 +NVIDIA RTX A6000,single,FALSE,FALSE,pytorch,64,-1,-1 +NVIDIA RTX A6000,half,FALSE,FALSE,pytorch,64,-1,-1 +NVIDIA RTX A6000,single,FALSE,FALSE,pytorch,128,-1,-1 +NVIDIA RTX A6000,half,FALSE,FALSE,pytorch,128,-1,-1 +NVIDIA H100 PCIe,single,FALSE,FALSE,pytorch,1,1.73,7.7 +NVIDIA H100 PCIe,half,FALSE,FALSE,pytorch,1,1.06,3.46 +NVIDIA H100 PCIe,single,FALSE,FALSE,pytorch,2,2.66,9.79 +NVIDIA H100 
PCIe,half,FALSE,FALSE,pytorch,2,1.73,4.57
+NVIDIA H100 PCIe,single,FALSE,FALSE,pytorch,4,4.47,18.49
+NVIDIA H100 PCIe,half,FALSE,FALSE,pytorch,4,2.63,8.91
+NVIDIA H100 PCIe,single,FALSE,FALSE,pytorch,8,8.16,23.86
+NVIDIA H100 PCIe,half,FALSE,FALSE,pytorch,8,4.97,12.57
+NVIDIA H100 PCIe,single,FALSE,FALSE,pytorch,16,15.98,42.38
+NVIDIA H100 PCIe,half,FALSE,FALSE,pytorch,16,9.61,29.01
+NVIDIA H100 PCIe,single,FALSE,FALSE,pytorch,32,32.04,80.51
+NVIDIA H100 PCIe,half,FALSE,FALSE,pytorch,32,19.07,55.57
+NVIDIA H100 PCIe,single,FALSE,FALSE,pytorch,64,-1,-1
+NVIDIA H100 PCIe,half,FALSE,FALSE,pytorch,64,-1,-1
+NVIDIA H100 PCIe,single,FALSE,FALSE,pytorch,128,-1,-1
+NVIDIA H100 PCIe,half,FALSE,FALSE,pytorch,128,-1,-1
diff --git a/docs/benchmark-update.md b/docs/benchmark-update.md
new file mode 100644
index 0000000..b383e01
--- /dev/null
+++ b/docs/benchmark-update.md
@@ -0,0 +1,23 @@
+# Benchmark update
+
+We are currently running benchmarks to update our Stable Diffusion numbers using a more recent version of Diffusers and to take advantage of xformers. The interim results on a limited set of GPUs are presented here.
+
+## Running the benchmark
+
+Ensure that the [NVIDIA container toolkit](https://docs.nvidia.com/datacenter/cloud-native/container-toolkit/install-guide.html) is installed on your system and then run the following:
+
+```bash
+git clone https://github.com/LambdaLabsML/lambda-diffusers.git
+cd lambda-diffusers/scripts
+make bench
+```
+
+Results will be written to `results.csv`. The benchmark's runtime depends on the GPU present, but expect it to take at least several minutes.
+
+## Results
+
+The current results for the benchmark are available in [`benchmark.csv`](../benchmarks/benchmark.csv). These results were run with Diffusers 0.11.0 and xformers using Ubuntu 20.04, Python 3.8, PyTorch 1.13, CUDA 11.8 ([NGC PyTorch container 22.11](https://docs.nvidia.com/deeplearning/frameworks/pytorch-release-notes/rel-22-11.html)).
+
+xformers provides a significant boost in performance and a reduction in memory consumption, allowing large batch sizes that maximise utilisation of the GPU. Our best performance comes from an NVIDIA A100-SXM4-40GB on [Lambda GPU cloud](https://cloud.lambdalabs.com): at the maximum tested batch size (128) and half precision, we observe a throughput of 1.85 images/second when sampling with DDIM for 30 steps.
+
+![](./pictures/sd_throughput.png)
\ No newline at end of file
diff --git a/docs/benchmark.md b/docs/benchmark.md
index 4109382..f16ea47 100644
--- a/docs/benchmark.md
+++ b/docs/benchmark.md
@@ -1,5 +1,7 @@
 # Benchmarking Diffuser Models
 
+__We are currently in the process of updating our Stable Diffusion benchmark using a more recent version of Diffusers and taking advantage of xformers. See the summary of interim results [here](./benchmark-update.md).__
+
 We present a benchmark of [Stable Diffusion](https://huggingface.co/CompVis/stable-diffusion) model inference. This text2image model uses a text prompt as input and outputs an image of resolution `512x512`.
 
 Our experiments analyze inference performance in terms of speed, memory consumption, throughput, and quality of the output images. We look at how different choices in hardware (GPU model, GPU vs CPU) and software (single vs half precision, pytorch vs onnxruntime) affect inference performance.
@@ -27,10 +29,10 @@ We run these same inference jobs CPU devices to put in perspective the inference
 
 We note that:
 
-* GPUs are significantly faster -- by one or two orders of magnitudes depending on the precisions. 
+* GPUs are significantly faster -- by one or two orders of magnitude depending on the precision.
 * `onnxruntime` can reduce the latency for CPU by about `40%` to `50%`, depending on the type of CPUs.
 
-ONNX currently does not have [stable support](https://github.com/huggingface/diffusers/issues/489) for Huggingface diffusers. 
+ONNX currently does not have [stable support](https://github.com/huggingface/diffusers/issues/489) for Huggingface diffusers.
 
 We will investigate `onnxruntime-gpu` in future benchmarks.
 
@@ -62,10 +64,10 @@ We run a series of throughput experiment in pytorch with half-precision and usin
 
 We note:
 
 * Once again, A100 80GB is the top performer and has the highest throughput.
-* The gap between A100 80GB and other cards in terms of throughput can be explained by the larger maximum batch size that can be used on this card. 
+* The gap between A100 80GB and other cards in terms of throughput can be explained by the larger maximum batch size that can be used on this card.
 
-As a concrete example, the chart below shows how A100 80GB's throughput increases by `64%` when we changed the batch size from 1 to 28 (the largest without causing an out of memory error). It is also interesting to see that the increase is not linear and flattens out when batch size reaches a certain value, at which point the tensor cores on the GPU are saturated and any new data in the GPU memory will have to be queued up before getting their own computing resources. 
+As a concrete example, the chart below shows how the A100 80GB's throughput increases by `64%` when we change the batch size from 1 to 28 (the largest that does not cause an out-of-memory error). It is also interesting to see that the increase is not linear and flattens out once the batch size reaches a certain value, at which point the tensor cores on the GPU are saturated and any new data in GPU memory has to be queued up before getting its own computing resources.
 
 Stable Diffusion Text2Image Batch size vs Throughput (images/minute)
 
@@ -76,7 +78,7 @@ We are curious about whether half-precision introduces degradations to the quali
 
 ![Evolution of precision v degradation across 100 steps](./pictures/benchmark_sd_precision_history.gif)
 
-Our observation is that there are indeed visible differences between the single-precision output and the half-precision output, especially in the early steps. The differences often decrease with the number of steps, but might not always vanish. 
+Our observation is that there are indeed visible differences between the single-precision output and the half-precision output, especially in the early steps. The differences often decrease with the number of steps, but might not always vanish.
 
 Interestingly, such a difference may not imply artifacts in half-precision's outputs. For example, in step 70, the picture below shows half-precision didn't produce the artifact in the single-precision output (an extra front leg):
 
@@ -162,7 +164,7 @@ sudo docker run --rm --gpus all nvidia/cuda:11.2.1-base-ubuntu20.04 nvidia-smi
 
 3. Build the benchmark docker image
 
 ```
-docker build -t benchmark -f ./benchmarking/Dockerfile . 
+docker build -t benchmark -f ./benchmarking/Dockerfile .
 ```
 
 #### Running the benchmark
diff --git a/docs/pictures/cost_analysis.png b/docs/pictures/cost_analysis.png
new file mode 100644
index 0000000..2b5a473
Binary files /dev/null and b/docs/pictures/cost_analysis.png differ
diff --git a/docs/pictures/sd_throughput.png b/docs/pictures/sd_throughput.png
new file mode 100644
index 0000000..06e0b15
Binary files /dev/null and b/docs/pictures/sd_throughput.png differ
diff --git a/docs/pictures/sd_throughput_mig.png b/docs/pictures/sd_throughput_mig.png
new file mode 100644
index 0000000..5e813a1
Binary files /dev/null and b/docs/pictures/sd_throughput_mig.png differ
diff --git a/docs/pictures/sd_throughput_noxformer.png b/docs/pictures/sd_throughput_noxformer.png
new file mode 100644
index 0000000..baae962
Binary files /dev/null and b/docs/pictures/sd_throughput_noxformer.png differ
diff --git a/scripts/Dockerfile b/scripts/Dockerfile
new file mode 100644
index 0000000..beb9793
--- /dev/null
+++ b/scripts/Dockerfile
@@ -0,0 +1,13 @@
+FROM nvcr.io/nvidia/pytorch:22.11-py3
+
+ENV PYTHONDONTWRITEBYTECODE 1
+ENV PYTHONUNBUFFERED 1
+
+RUN pip install --pre xformers
+RUN pip install diffusers==0.11.0 accelerate transformers
+
+WORKDIR /workspace
+
+COPY benchmark.py /workspace/benchmark.py
+RUN (printf '#!/bin/bash\npython benchmark.py \"$@\"\n' >> /entry.sh) && chmod a+x /entry.sh
+ENTRYPOINT ["/entry.sh"]
\ No newline at end of file
diff --git a/scripts/Makefile b/scripts/Makefile
new file mode 100644
index 0000000..9b722fb
--- /dev/null
+++ b/scripts/Makefile
@@ -0,0 +1,17 @@
+bench:
+	docker build -t sd-bench .
+	docker run \
+		--rm -it \
+		--gpus all \
+		--shm-size=128g \
+		--net=host \
+		-v $(PWD):/workspace/results \
+		sd-bench \
+		--steps 30 \
+		--samples 1,2,4,8,16,32,64,128 \
+		--autocast no \
+		--xformers yes \
+		--output_file /workspace/results/results.csv
+
+clean:
+	rm -f results.csv
\ No newline at end of file
diff --git a/scripts/benchmark.py b/scripts/benchmark.py
index 4e94b8d..eef4818 100644
--- a/scripts/benchmark.py
+++ b/scripts/benchmark.py
@@ -5,6 +5,7 @@ import pathlib
 import csv
 from contextlib import nullcontext
+import itertools
 import torch
 from torch import autocast
 from diffusers import StableDiffusionPipeline, StableDiffusionOnnxPipeline
@@ -13,12 +14,18 @@ prompt = "a photo of an astronaut riding a horse on mars"
 
+def make_bool(yes_or_no):
+    if yes_or_no.lower() == "yes":
+        return True
+    elif yes_or_no.lower() == "no":
+        return False
+    else:
+        raise ValueError(f"unrecognised input {yes_or_no}")
 
 
 def get_inference_pipeline(precision, backend):
     """
    returns HuggingFace diffuser pipeline
    cf https://github.com/huggingface/diffusers#text-to-image-generation-with-stable-diffusion
-    note: could not download from CompVis/stable-diffusion-v1-4 (access restricted)
    """

    assert precision in ("half", "single"), "precision in ['half', 'single']"
@@ -28,7 +35,6 @@ def get_inference_pipeline(precision, backend):
        pipe = StableDiffusionPipeline.from_pretrained(
            "CompVis/stable-diffusion-v1-4",
            revision="main" if precision == "single" else "fp16",
-            use_auth_token=os.environ["ACCESS_TOKEN"],
            torch_dtype=torch.float32 if precision == "single" else torch.float16,
        )
        pipe = pipe.to(device)
@@ -103,9 +109,9 @@ def get_inference_memory(pipe, n_samples, use_autocast, num_inference_steps):
        mem = torch.cuda.memory_reserved()
    return round(mem / 1e9, 2)

-
+@torch.inference_mode()
 def run_benchmark(
-    n_repeats, n_samples, precision, use_autocast, backend, num_inference_steps
+    n_repeats, n_samples, precision, use_autocast, xformers, backend, num_inference_steps
 ):
    """
    * n_repeats: nb datapoints for inference latency benchmark
@@ -116,7 +122,14 @@
    dict like {'memory usage': 17.70, 'latency': 86.71'}
    """
+    print(f"n_samples: {n_samples}\tprecision: {precision}\tautocast: {use_autocast}\txformers: {xformers}\tbackend: {backend}")
+
    pipe = get_inference_pipeline(precision, backend)
+    if xformers:
+        pipe.enable_xformers_memory_efficient_attention()
+
+    if n_samples > 16:
+        pipe.enable_vae_slicing()

    logs = {
        "memory": 0.00
@@ -128,8 +141,8 @@
            pipe, n_samples, n_repeats, use_autocast, num_inference_steps
        ),
    }
-    print(f"n_samples: {n_samples}\tprecision: {precision}\tautocast: {use_autocast}\tbackend: {backend}")
    print(logs, "\n")
+    print("============================")
    return logs
@@ -148,7 +161,7 @@ def get_device_description():
    return torch.cuda.get_device_name()


-def run_benchmark_grid(grid, n_repeats, num_inference_steps):
+def run_benchmark_grid(grid, n_repeats, num_inference_steps, csv_fpath):
    """
    * grid : dict like
        {
@@ -159,13 +172,13 @@
    * n_repeats: nb datapoints for inference latency benchmark
    """

-    csv_fpath = pathlib.Path(__file__).parent.parent / "benchmark_tmp.csv"
    # create benchmark.csv if not exists
    if not os.path.isfile(csv_fpath):
        header = [
            "device",
            "precision",
            "autocast",
+            "xformers",
            "runtime",
            "n_samples",
            "latency",
@@ -179,45 +192,45 @@
    with open(csv_fpath, "a") as f:
        writer = csv.writer(f)
        device_desc = get_device_description()
-        for n_samples in grid["n_samples"]:
-            for precision in grid["precision"]:
-                use_autocast = False
-                if precision == "half":
-                    for autocast in grid["autocast"]:
-                        if autocast == "yes":
-                            use_autocast = True
-                for backend in grid["backend"]:
-                    try:
-                        new_log = run_benchmark(
-                            n_repeats=n_repeats,
-                            n_samples=n_samples,
-                            precision=precision,
-                            use_autocast=use_autocast,
-                            backend=backend,
-                            num_inference_steps=num_inference_steps,
-                        )
-                    except Exception as e:
-                        if "CUDA out of memory" in str(
-                            e
-                        ) or "Failed to allocate memory" in str(e):
-                            print(str(e))
-                            torch.cuda.empty_cache()
-                            new_log = {"latency": -1.00, "memory": -1.00}
-                        else:
-                            raise e
-
-                    latency = new_log["latency"]
-                    memory = new_log["memory"]
-                    new_row = [
-                        device_desc,
-                        precision,
-                        autocast,
-                        backend,
-                        n_samples,
-                        latency,
-                        memory,
-                    ]
-                    writer.writerow(new_row)
+        for trial in itertools.product(*grid.values()):
+
+            n_samples, precision, use_autocast, xformers, backend = trial
+            use_autocast = make_bool(use_autocast)
+            xformers = make_bool(xformers)
+
+            try:
+                new_log = run_benchmark(
+                    n_repeats=n_repeats,
+                    n_samples=n_samples,
+                    precision=precision,
+                    use_autocast=use_autocast,
+                    xformers=xformers,
+                    backend=backend,
+                    num_inference_steps=num_inference_steps,
+                )
+            except Exception as e:
+                if "CUDA out of memory" in str(
+                    e
+                ) or "Failed to allocate memory" in str(e):
+                    print(str(e))
+                    torch.cuda.empty_cache()
+                    new_log = {"latency": -1.00, "memory": -1.00}
+                else:
+                    raise e
+
+            latency = new_log["latency"]
+            memory = new_log["memory"]
+            new_row = [
+                device_desc,
+                precision,
+                use_autocast,
+                xformers,
+                backend,
+                n_samples,
+                latency,
+                memory,
+            ]
+            writer.writerow(new_row)


 if __name__ == "__main__":
@@ -249,6 +262,20 @@
        help="If 'yes', will perform additional runs with autocast activated for half precision inferences",
    )

+    parser.add_argument(
+        "--xformers",
+        default="yes",
+        type=str,
+        help="If 'yes', will use xformers flash attention",
+    )
+
+    parser.add_argument(
+        "--output_file",
+        default="results.csv",
+        type=str,
+        help="Path to output csv file to write",
+    )
+
    args = parser.parse_args()

    grid = {
@@ -257,10 +284,11 @@
        # Remove autocast won't help. Ref:
        # https://github.com/CompVis/stable-diffusion/issues/307
        "precision": ("single",) if device.type == "cpu" else ("single", "half"),
-        "autocast": ("no",) if args.autocast == "no" else ("yes", "no"),
+        "autocast": args.autocast.split(","),
+        "xformers": args.xformers.split(","),
        # Only use onnx for cpu, until issues are fixed by upstreams. Ref:
        # https://github.com/huggingface/diffusers/issues/489#issuecomment-1261577250
        # https://github.com/huggingface/diffusers/pull/440
        "backend": ("pytorch", "onnx") if device.type == "cpu" else ("pytorch",),
    }
-    run_benchmark_grid(grid, n_repeats=args.repeats, num_inference_steps=args.steps)
+    run_benchmark_grid(grid, n_repeats=args.repeats, num_inference_steps=args.steps, csv_fpath=args.output_file)
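
A note on the `itertools.product` rewrite above: the old nested loops are replaced by a flat sweep over the Cartesian product of the grid values, one benchmark run per tuple. A standalone sketch of the enumeration (the grid values here are hypothetical):

```python
import itertools

# One run per (n_samples, precision, autocast, xformers, backend) tuple.
# The unpacking relies on dict insertion order (guaranteed in Python 3.7+)
# matching the key order of the grid, as the script does.
grid = {
    "n_samples": (1, 2, 4),
    "precision": ("single", "half"),
    "autocast": ("no",),
    "xformers": ("yes", "no"),
    "backend": ("pytorch",),
}
for n_samples, precision, autocast, xformers, backend in itertools.product(*grid.values()):
    print(n_samples, precision, autocast, xformers, backend)
```

Out-of-memory grid points are recorded with `latency` and `memory` set to `-1` rather than aborting the sweep, which is why the CSV results above contain `-1` rows.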