diff --git a/benchmark_results/latency_by_resolution.png b/benchmark_results/latency_by_resolution.png
new file mode 100644
index 0000000..b980aa2
Binary files /dev/null and b/benchmark_results/latency_by_resolution.png differ
diff --git a/benchmark_results/memory_usage_by_resolution.png b/benchmark_results/memory_usage_by_resolution.png
new file mode 100644
index 0000000..cd09069
Binary files /dev/null and b/benchmark_results/memory_usage_by_resolution.png differ
diff --git a/benchmark_results/plot_result.py b/benchmark_results/plot_result.py
new file mode 100644
index 0000000..7bb6ad4
--- /dev/null
+++ b/benchmark_results/plot_result.py
@@ -0,0 +1,43 @@
+import pandas as pd
+import matplotlib.pyplot as plt
+import seaborn as sns
+
+# Function to read and preprocess data
+def read_and_preprocess(filename):
+    df = pd.read_csv(filename)
+    df = df[(df['latency'] != 'error') & (df['memory_usage'] != 'error')]
+    df['latency'] = pd.to_numeric(df['latency'])
+    df['memory_usage'] = pd.to_numeric(df['memory_usage'])
+    df['resolution'] = df['width'].astype(str) + 'x' + df['height'].astype(str)
+    return df
+
+# Load and preprocess the data
+lpw_df = read_and_preprocess('results_LPW.csv')
+sdxl_df = read_and_preprocess('results_SDXL.csv')
+
+# Set the style of seaborn
+sns.set(style="whitegrid")
+
+# Plotting latency
+plt.figure(figsize=(12, 6))
+sns.lineplot(data=pd.concat([lpw_df, sdxl_df]), x='resolution', y='latency', hue='pipeline_type', style='precision', markers=True, dashes=False)
+plt.title('Latency by Resolution for Different Pipelines and Precisions')
+plt.xlabel('Resolution (Width x Height)')
+plt.ylabel('Latency (s)')
+plt.xticks(rotation=45)
+plt.legend(title='Pipeline / Precision')
+plt.tight_layout()
+plt.savefig('latency_by_resolution.png')  # Save the figure as a PNG file
+plt.show()
+
+# Plotting memory usage
+plt.figure(figsize=(12, 6))
+sns.lineplot(data=pd.concat([lpw_df, sdxl_df]), x='resolution', y='memory_usage', hue='pipeline_type', style='precision', markers=True, dashes=False)
+plt.title('Memory Usage by Resolution for Different Pipelines and Precisions')
+plt.xlabel('Resolution (Width x Height)')
+plt.ylabel('Memory Usage (GB)')
+plt.xticks(rotation=45)
+plt.legend(title='Pipeline / Precision')
+plt.tight_layout()
+plt.savefig('memory_usage_by_resolution.png')  # Save the figure as a PNG file
+plt.show()
diff --git a/benchmark_results/results_LPW.csv b/benchmark_results/results_LPW.csv
new file mode 100644
index 0000000..ce1ccfe
--- /dev/null
+++ b/benchmark_results/results_LPW.csv
@@ -0,0 +1,9 @@
+device,pipeline_type,precision,width,height,latency,memory_usage
+NVIDIA RTX A4000,LPW,single,512,512,10.99,16.47
+NVIDIA RTX A4000,LPW,single,512,768,17.08,16.39
+NVIDIA RTX A4000,LPW,single,512,1024,23.58,16.37
+NVIDIA RTX A4000,LPW,single,1024,1024,error,error
+NVIDIA RTX A4000,LPW,half,512,512,4.11,9.66
+NVIDIA RTX A4000,LPW,half,512,768,4.92,10.97
+NVIDIA RTX A4000,LPW,half,512,1024,5.93,11.9
+NVIDIA RTX A4000,LPW,half,1024,1024,12.06,16.14
diff --git a/benchmark_results/results_SDXL.csv b/benchmark_results/results_SDXL.csv
new file mode 100644
index 0000000..a3a5a59
--- /dev/null
+++ b/benchmark_results/results_SDXL.csv
@@ -0,0 +1,9 @@
+device,pipeline_type,precision,width,height,latency,memory_usage
+NVIDIA RTX A4000,SDXL,single,512,512,11.12,16.49
+NVIDIA RTX A4000,SDXL,single,512,768,17.46,16.39
+NVIDIA RTX A4000,SDXL,single,512,1024,24.04,16.37
+NVIDIA RTX A4000,SDXL,single,1024,1024,error,error
+NVIDIA RTX A4000,SDXL,half,512,512,4.91,9.69
+NVIDIA RTX A4000,SDXL,half,512,768,5.52,10.97
+NVIDIA RTX A4000,SDXL,half,512,1024,6.01,11.88
+NVIDIA RTX A4000,SDXL,half,1024,1024,12.13,16.63
diff --git a/scripts/benchmark.py b/scripts/benchmark.py
index b9a8299..5229e71 100644
--- a/scripts/benchmark.py
+++ b/scripts/benchmark.py
@@ -72,4 +72,4 @@ def run_benchmark_grid():
                 writer.writerow([device_desc, precision, width, height, "error", "error"])
 
 if __name__ == "__main__":
-    run_benchmark_grid()
+    run_benchmark_grid()
\ No newline at end of file
diff --git a/scripts/benchmark_lpw.py b/scripts/benchmark_lpw.py
new file mode 100644
index 0000000..20acc5e
--- /dev/null
+++ b/scripts/benchmark_lpw.py
@@ -0,0 +1,105 @@
+import os
+os.environ['PYTORCH_CUDA_ALLOC_CONF'] = 'max_split_size_mb:4096'
+import torch
+import csv
+import itertools
+from diffusers import DiffusionPipeline
+from torch.utils.benchmark import Timer
+
+device = torch.device("cuda:0")
+prompt = "a photo of an astronaut riding a horse on mars"
+neg_prompt = "blur, low quality, carton, animate"
+num_inference_steps = 30
+
+def get_inference_pipeline_SDXL(precision):
+    """
+    Returns a Hugging Face diffusion pipeline for Stable Diffusion XL without LPW.
+    """
+    assert precision in ("half", "single"), "Precision must be either 'half' or 'single'."
+
+    pipe = DiffusionPipeline.from_pretrained(
+        "stabilityai/stable-diffusion-xl-base-1.0",
+        torch_dtype=torch.float32 if precision == "single" else torch.float16,
+        use_safetensors=True,
+        variant="fp16"
+    )
+    return pipe.to(device)
+
+def get_inference_pipeline_LPW(precision):
+    """
+    Returns a Hugging Face diffusion pipeline for Stable Diffusion XL with LPW enabled,
+    enhancing performance on long prompts.
+    """
+    assert precision in ("half", "single"), "Precision must be either 'half' or 'single'."
+
+    pipe = DiffusionPipeline.from_pretrained(
+        "stabilityai/stable-diffusion-xl-base-1.0",
+        torch_dtype=torch.float32 if precision == "single" else torch.float16,
+        use_safetensors=True,
+        custom_pipeline="lpw_stable_diffusion_xl",
+        variant="fp16"
+    )
+    return pipe.to(device)
+
+def do_inference(pipe, width, height):
+    torch.cuda.empty_cache()
+    with torch.no_grad():
+        images = pipe(prompt=prompt, negative_prompt=neg_prompt, width=width, height=height, num_inference_steps=num_inference_steps).images[0]
+    return images
+
+def get_inference_time(pipe, width, height):
+    timer = Timer(
+        stmt="do_inference(pipe, width, height)",
+        setup="from __main__ import do_inference",
+        globals={"pipe": pipe, "width": width, "height": height},
+    )
+    profile_result = timer.timeit(1)
+    return round(profile_result.mean, 2)
+
+def get_inference_memory(pipe, width, height):
+    do_inference(pipe, width, height)
+    mem = torch.cuda.memory_reserved(device=device)
+    return round(mem / 1e9, 2)
+
+@torch.inference_mode()
+def run_benchmark(pipeline_type, precision, width, height):
+    if pipeline_type == 'SDXL':
+        pipe = get_inference_pipeline_SDXL(precision)
+    elif pipeline_type == 'LPW':
+        pipe = get_inference_pipeline_LPW(precision)
+    else:
+        raise ValueError("Invalid pipeline type")
+
+    latency = get_inference_time(pipe, width, height)
+    memory_usage = get_inference_memory(pipe, width, height)
+    logs = {"pipeline_type": pipeline_type, "precision": precision, "width": width, "height": height, "latency": latency, "memory_usage": memory_usage}
+    print(logs)
+    print("============================")
+    return logs
+
+def get_device_description():
+    return torch.cuda.get_device_name()
+
+def run_benchmark_grid():
+    device_desc = get_device_description()
+    pipeline_types = ['LPW', 'SDXL']
+    precision_options = ("single", "half")
+    image_sizes = [(512, 512), (512, 768), (512, 1024), (1024, 1024)]
+
+    for pipeline_type in pipeline_types:
+        results_file = f"results_{pipeline_type}.csv"
+        with open(results_file, "w") as f:
+            writer = csv.writer(f)
+            writer.writerow(["device", "pipeline_type", "precision", "width", "height", "latency", "memory_usage"])
+
+            for precision, (width, height) in itertools.product(precision_options, image_sizes):
+                try:
+                    log = run_benchmark(pipeline_type, precision, width, height)
+                    writer.writerow([device_desc, pipeline_type, precision, width, height, log["latency"], log["memory_usage"]])
+                except Exception as e:
+                    print(f"Error with {pipeline_type}: {e}")
+                    writer.writerow([device_desc, pipeline_type, precision, width, height, "error", "error"])
+
+
+if __name__ == "__main__":
+    run_benchmark_grid()
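
Note on the LPW pipeline benchmarked above: the sketch below is a minimal, hypothetical usage example (not part of this change set) of the lpw_stable_diffusion_xl community pipeline that scripts/benchmark_lpw.py loads via custom_pipeline; it shows the long-prompt case the pipeline exists for, since the stock SDXL pipeline truncates prompts at the 77-token CLIP limit. The long prompt, guidance_scale value, and output filename are illustrative assumptions, not values used by the benchmark.

    import torch
    from diffusers import DiffusionPipeline

    # Load SDXL with the long-prompt-weighting (LPW) community pipeline, the same
    # way get_inference_pipeline_LPW does in scripts/benchmark_lpw.py.
    pipe = DiffusionPipeline.from_pretrained(
        "stabilityai/stable-diffusion-xl-base-1.0",
        torch_dtype=torch.float16,
        use_safetensors=True,
        custom_pipeline="lpw_stable_diffusion_xl",
        variant="fp16",
    ).to("cuda")

    # A deliberately long prompt (well past 77 CLIP tokens); the standard SDXL
    # pipeline would truncate it, while the LPW pipeline encodes all of it.
    long_prompt = ", ".join(
        ["a photo of an astronaut riding a horse on mars"]
        + ["highly detailed", "cinematic lighting", "volumetric fog", "8k render"] * 20
    )

    image = pipe(
        prompt=long_prompt,
        negative_prompt="blur, low quality",
        width=512,
        height=768,
        num_inference_steps=30,
        guidance_scale=7.5,  # illustrative value, not set by the benchmark scripts
    ).images[0]
    image.save("lpw_long_prompt.png")  # hypothetical output path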