diff --git a/benchmark_results/latency_by_resolution.png b/benchmark_results/latency_by_resolution.png
new file mode 100644
index 0000000..b980aa2
Binary files /dev/null and b/benchmark_results/latency_by_resolution.png differ
diff --git a/benchmark_results/memory_usage_by_resolution.png b/benchmark_results/memory_usage_by_resolution.png
new file mode 100644
index 0000000..cd09069
Binary files /dev/null and b/benchmark_results/memory_usage_by_resolution.png differ
diff --git a/benchmark_results/plot_result.py b/benchmark_results/plot_result.py
new file mode 100644
index 0000000..7bb6ad4
--- /dev/null
+++ b/benchmark_results/plot_result.py
@@ -0,0 +1,43 @@
+import pandas as pd
+import matplotlib.pyplot as plt
+import seaborn as sns
+
+# Function to read and preprocess data
+def read_and_preprocess(filename):
+    df = pd.read_csv(filename)
+    df = df[(df['latency'] != 'error') & (df['memory_usage'] != 'error')]
+    df['latency'] = pd.to_numeric(df['latency'])
+    df['memory_usage'] = pd.to_numeric(df['memory_usage'])
+    df['resolution'] = df['width'].astype(str) + 'x' + df['height'].astype(str)
+    return df
+
+# Load and preprocess the data
+lpw_df = read_and_preprocess('results_LPW.csv')
+sdxl_df = read_and_preprocess('results_SDXL.csv')
+
+# Set the style of seaborn
+sns.set(style="whitegrid")
+
+# Plotting latency
+plt.figure(figsize=(12, 6))
+sns.lineplot(data=pd.concat([lpw_df, sdxl_df]), x='resolution', y='latency', hue='pipeline_type', style='precision', markers=True, dashes=False)
+plt.title('Latency by Resolution for Different Pipelines and Precisions')
+plt.xlabel('Resolution (Width x Height)')
+plt.ylabel('Latency (s)')
+plt.xticks(rotation=45)
+plt.legend(title='Pipeline / Precision')
+plt.tight_layout()
+plt.savefig('latency_by_resolution.png')  # Save the figure as a PNG file
+plt.show()
+
+# Plotting memory usage
+plt.figure(figsize=(12, 6))
+sns.lineplot(data=pd.concat([lpw_df, sdxl_df]), x='resolution', y='memory_usage', hue='pipeline_type', style='precision', markers=True, dashes=False)
+plt.title('Memory Usage by Resolution for Different Pipelines and Precisions')
+plt.xlabel('Resolution (Width x Height)')
+plt.ylabel('Memory Usage (GB)')
+plt.xticks(rotation=45)
+plt.legend(title='Pipeline / Precision')
+plt.tight_layout()
+plt.savefig('memory_usage_by_resolution.png')  # Save the figure as a PNG file
+plt.show()
diff --git a/benchmark_results/results_LPW.csv b/benchmark_results/results_LPW.csv
new file mode 100644
index 0000000..ce1ccfe
--- /dev/null
+++ b/benchmark_results/results_LPW.csv
@@ -0,0 +1,9 @@
+device,pipeline_type,precision,width,height,latency,memory_usage
+NVIDIA RTX A4000,LPW,single,512,512,10.99,16.47
+NVIDIA RTX A4000,LPW,single,512,768,17.08,16.39
+NVIDIA RTX A4000,LPW,single,512,1024,23.58,16.37
+NVIDIA RTX A4000,LPW,single,1024,1024,error,error
+NVIDIA RTX A4000,LPW,half,512,512,4.11,9.66
+NVIDIA RTX A4000,LPW,half,512,768,4.92,10.97
+NVIDIA RTX A4000,LPW,half,512,1024,5.93,11.9
+NVIDIA RTX A4000,LPW,half,1024,1024,12.06,16.14
diff --git a/benchmark_results/results_SDXL.csv b/benchmark_results/results_SDXL.csv
new file mode 100644
index 0000000..a3a5a59
--- /dev/null
+++ b/benchmark_results/results_SDXL.csv
@@ -0,0 +1,9 @@
+device,pipeline_type,precision,width,height,latency,memory_usage
+NVIDIA RTX A4000,SDXL,single,512,512,11.12,16.49
+NVIDIA RTX A4000,SDXL,single,512,768,17.46,16.39
+NVIDIA RTX A4000,SDXL,single,512,1024,24.04,16.37
+NVIDIA RTX A4000,SDXL,single,1024,1024,error,error
+NVIDIA RTX A4000,SDXL,half,512,512,4.91,9.69
+NVIDIA RTX A4000,SDXL,half,512,768,5.52,10.97
+NVIDIA RTX A4000,SDXL,half,512,1024,6.01,11.88
+NVIDIA RTX A4000,SDXL,half,1024,1024,12.13,16.63
diff --git a/scripts/benchmark.py b/scripts/benchmark.py
index b9a8299..5229e71 100644
--- a/scripts/benchmark.py
+++ b/scripts/benchmark.py
@@ -72,4 +72,4 @@ def run_benchmark_grid():
                 writer.writerow([device_desc, precision, width, height, "error", "error"])
 
 if __name__ == "__main__":
-    run_benchmark_grid()
+    run_benchmark_grid()
\ No newline at end of file
diff --git a/scripts/benchmark_lpw.py b/scripts/benchmark_lpw.py
new file mode 100644
index 0000000..20acc5e
--- /dev/null
+++ b/scripts/benchmark_lpw.py
@@ -0,0 +1,105 @@
+import os
+os.environ['PYTORCH_CUDA_ALLOC_CONF'] = 'max_split_size_mb:4096'
+import torch
+import csv
+import itertools
+from diffusers import DiffusionPipeline
+from torch.utils.benchmark import Timer
+
+device = torch.device("cuda:0")
+prompt = "a photo of an astronaut riding a horse on mars"
+neg_prompt = "blur, low quality, carton, animate"
+num_inference_steps = 30
+
+def get_inference_pipeline_SDXL(precision):
+    """
+    Returns a Hugging Face diffusion pipeline for Stable Diffusion XL without LPW.
+    """
+    assert precision in ("half", "single"), "Precision must be either 'half' or 'single'."
+
+    pipe = DiffusionPipeline.from_pretrained(
+        "stabilityai/stable-diffusion-xl-base-1.0",
+        torch_dtype=torch.float32 if precision == "single" else torch.float16,
+        use_safetensors=True,
+        variant="fp16"
+    )
+    return pipe.to(device)
+
+def get_inference_pipeline_LPW(precision):
+    """
+    Returns a Hugging Face diffusion pipeline for Stable Diffusion XL with LPW enabled,
+    enhancing performance on long prompts.
+    """
+    assert precision in ("half", "single"), "Precision must be either 'half' or 'single'."
+
+    pipe = DiffusionPipeline.from_pretrained(
+        "stabilityai/stable-diffusion-xl-base-1.0",
+        torch_dtype=torch.float32 if precision == "single" else torch.float16,
+        use_safetensors=True,
+        custom_pipeline="lpw_stable_diffusion_xl",
+        variant="fp16"
+    )
+    return pipe.to(device)
+
+def do_inference(pipe, width, height):
+    torch.cuda.empty_cache()
+    with torch.no_grad():
+        images = pipe(prompt=prompt, negative_prompt=neg_prompt, width=width, height=height, num_inference_steps=num_inference_steps).images[0]
+    return images
+
+def get_inference_time(pipe, width, height):
+    timer = Timer(
+        stmt="do_inference(pipe, width, height)",
+        setup="from __main__ import do_inference",
+        globals={"pipe": pipe, "width": width, "height": height},
+    )
+    profile_result = timer.timeit(1)
+    return round(profile_result.mean, 2)
+
+def get_inference_memory(pipe, width, height):
+    do_inference(pipe, width, height)
+    mem = torch.cuda.memory_reserved(device=device)
+    return round(mem / 1e9, 2)
+
+@torch.inference_mode()
+def run_benchmark(pipeline_type, precision, width, height):
+    if pipeline_type == 'SDXL':
+        pipe = get_inference_pipeline_SDXL(precision)
+    elif pipeline_type == 'LPW':
+        pipe = get_inference_pipeline_LPW(precision)
+    else:
+        raise ValueError("Invalid pipeline type")
+
+    latency = get_inference_time(pipe, width, height)
+    memory_usage = get_inference_memory(pipe, width, height)
+    logs = {"pipeline_type": pipeline_type, "precision": precision, "width": width, "height": height, "latency": latency, "memory_usage": memory_usage}
+    print(logs)
+    print("============================")
+    return logs
+
+def get_device_description():
+    return torch.cuda.get_device_name()
+
+def run_benchmark_grid():
+    device_desc = get_device_description()
+    pipeline_types = ['LPW', 'SDXL']
+    precision_options = ("single", "half")
+    image_sizes = [(512, 512), (512, 768), (512, 1024), (1024, 1024)]
+
+    for pipeline_type in pipeline_types:
+        results_file = f"results_{pipeline_type}.csv"
+        with open(results_file, "w") as f:
+            writer = csv.writer(f)
+            writer.writerow(["device", "pipeline_type", "precision", "width", "height", "latency", "memory_usage"])
+
+            for precision, (width, height) in itertools.product(precision_options, image_sizes):
+                try:
+                    log = run_benchmark(pipeline_type, precision, width, height)
+                    writer.writerow([device_desc, pipeline_type, precision, width, height, log["latency"], log["memory_usage"]])
+                except Exception as e:
+                    print(f"Error with {pipeline_type}: {e}")
+                    writer.writerow([device_desc, pipeline_type, precision, width, height, "error", "error"])
+
+
+if __name__ == "__main__":
+    run_benchmark_grid()
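
Note on the LPW pipeline benchmarked above: the sketch below is a minimal, hypothetical usage example (not part of this change set) of the lpw_stable_diffusion_xl community pipeline that scripts/benchmark_lpw.py loads via custom_pipeline; it shows the long-prompt case the pipeline exists for, since the stock SDXL pipeline truncates prompts at the 77-token CLIP limit. The long prompt, guidance_scale value, and output filename are illustrative assumptions, not values used by the benchmark.

    import torch
    from diffusers import DiffusionPipeline

    # Load SDXL with the long-prompt-weighting (LPW) community pipeline, the same
    # way get_inference_pipeline_LPW does in scripts/benchmark_lpw.py.
    pipe = DiffusionPipeline.from_pretrained(
        "stabilityai/stable-diffusion-xl-base-1.0",
        torch_dtype=torch.float16,
        use_safetensors=True,
        custom_pipeline="lpw_stable_diffusion_xl",
        variant="fp16",
    ).to("cuda")

    # A deliberately long prompt (well past 77 CLIP tokens); the standard SDXL
    # pipeline would truncate it, while the LPW pipeline encodes all of it.
    long_prompt = ", ".join(
        ["a photo of an astronaut riding a horse on mars"]
        + ["highly detailed", "cinematic lighting", "volumetric fog", "8k render"] * 20
    )

    image = pipe(
        prompt=long_prompt,
        negative_prompt="blur, low quality",
        width=512,
        height=768,
        num_inference_steps=30,
        guidance_scale=7.5,  # illustrative value, not set by the benchmark scripts
    ).images[0]
    image.save("lpw_long_prompt.png")  # hypothetical output path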