Commit 2c1e19e: cuda autocast y/n argument

1 parent 90bd1b4, commit 2c1e19e

File tree: 1 file changed, +107 -55 lines

scripts/benchmark.py
Lines changed: 107 additions & 55 deletions
@@ -5,6 +5,7 @@
 import pathlib
 import csv
 from contextlib import nullcontext
+from sqlalchemy import false
 import torch
 from torch import autocast
 from diffusers import StableDiffusionPipeline, StableDiffusionOnnxPipeline
@@ -13,6 +14,7 @@

 prompt = "a photo of an astronaut riding a horse on mars"

+
 def get_inference_pipeline(precision, backend):
     """
     returns HuggingFace diffuser pipeline
@@ -31,12 +33,14 @@ def get_inference_pipeline(precision, backend):
             torch_dtype=torch.float32 if precision == "single" else torch.float16,
         )
         pipe = pipe.to(device)
-    else:
+    else:
         pipe = StableDiffusionOnnxPipeline.from_pretrained(
             "CompVis/stable-diffusion-v1-4",
             use_auth_token=os.environ["ACCESS_TOKEN"],
             revision="onnx",
-            provider="CPUExecutionProvider" if device.type=="cpu" else "CUDAExecutionProvider",
+            provider="CPUExecutionProvider"
+            if device.type == "cpu"
+            else "CUDAExecutionProvider",
             torch_dtype=torch.float32 if precision == "single" else torch.float16,
         )

@@ -51,43 +55,59 @@ def null_safety(images, **kwargs):
     return pipe


-def do_inference(pipe, n_samples, precision, num_inference_steps):
+def do_inference(pipe, n_samples, use_autocast, num_inference_steps):
     torch.cuda.empty_cache()
-    context = autocast if (device.type == "cuda" and precision == 'half') else nullcontext
+    context = (
+        autocast if (device.type == "cuda" and use_autocast) else nullcontext
+    )
     with context("cuda"):
-        images = pipe(prompt=[prompt] * n_samples, num_inference_steps=num_inference_steps).images
+        images = pipe(
+            prompt=[prompt] * n_samples, num_inference_steps=num_inference_steps
+        ).images

     return images


-def get_inference_time(pipe, n_samples, n_repeats, precision, num_inference_steps):
+def get_inference_time(
+    pipe, n_samples, n_repeats, use_autocast, num_inference_steps
+):
     from torch.utils.benchmark import Timer
+
     timer = Timer(
-        stmt="do_inference(pipe, n_samples, precision, num_inference_steps)",
+        stmt="do_inference(pipe, n_samples, use_autocast, num_inference_steps)",
         setup="from __main__ import do_inference",
-        globals={"pipe": pipe, "n_samples": n_samples, "precision": precision, "num_inference_steps": num_inference_steps},
-        num_threads=multiprocessing.cpu_count()
+        globals={
+            "pipe": pipe,
+            "n_samples": n_samples,
+            "use_autocast": use_autocast,
+            "num_inference_steps": num_inference_steps,
+        },
+        num_threads=multiprocessing.cpu_count(),
     )
     profile_result = timer.timeit(
         n_repeats
     )  # benchmark.Timer performs 2 iterations for warmup
     return round(profile_result.mean, 2)


-def get_inference_memory(pipe, n_samples, precision, num_inference_steps):
+def get_inference_memory(pipe, n_samples, use_autocast, num_inference_steps):
     if not torch.cuda.is_available():
         return 0
-
+
     torch.cuda.empty_cache()
-    context = autocast if (device.type == "cuda" and precision == 'half') else nullcontext
+    context = autocast if (device.type == "cuda" and use_autocast) else nullcontext
     with context("cuda"):
-        images = pipe(prompt=[prompt] * n_samples, num_inference_steps=num_inference_steps).images
+        images = pipe(
+            prompt=[prompt] * n_samples, num_inference_steps=num_inference_steps
+        ).images

     mem = torch.cuda.memory_reserved()
     return round(mem / 1e9, 2)


-def run_benchmark(n_repeats, n_samples, precision, backend, num_inference_steps):
+def run_benchmark(
+    n_repeats, n_samples, precision, use_autocast, backend, num_inference_steps
+):
     """
     * n_repeats: nb datapoints for inference latency benchmark
     * n_samples: number of samples to generate (~ batch size)
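The key change in this hunk is that `do_inference`, `get_inference_time`, and `get_inference_memory` now take an explicit `use_autocast` flag instead of inferring it from `precision == 'half'`. A minimal, self-contained sketch of the autocast/nullcontext toggle (a trivial matmul stands in for the pipeline call; this is an illustration, not code from the commit):

```python
# Sketch of the context-manager toggle used in do_inference():
# torch.autocast when CUDA is available and the flag is on, otherwise nullcontext.
from contextlib import nullcontext

import torch
from torch import autocast

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")


def run_workload(use_autocast: bool) -> float:
    # Choose the context manager once, based on device type and the flag.
    context = autocast if (device.type == "cuda" and use_autocast) else nullcontext
    with context("cuda"):
        x = torch.randn(512, 512, device=device)
        # Under autocast the matmul runs in half precision on GPU.
        return (x @ x).float().sum().item()


if __name__ == "__main__":
    print(run_workload(use_autocast=True))
    print(run_workload(use_autocast=False))
```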
@@ -100,10 +120,16 @@ def run_benchmark(n_repeats, n_samples, precision, backend, num_inference_steps)
     pipe = get_inference_pipeline(precision, backend)

     logs = {
-        "memory": 0.00 if device.type=="cpu" else get_inference_memory(pipe, n_samples, precision, num_inference_steps),
-        "latency": get_inference_time(pipe, n_samples, n_repeats, precision, num_inference_steps),
+        "memory": 0.00
+        if device.type == "cpu"
+        else get_inference_memory(
+            pipe, n_samples, use_autocast, num_inference_steps
+        ),
+        "latency": get_inference_time(
+            pipe, n_samples, n_repeats, use_autocast, num_inference_steps
+        ),
     }
-    print(f"n_samples: {n_samples}\tprecision: {precision}\tbackend: {backend}")
+    print(f"n_samples: {n_samples}\tprecision: {precision}\tautocast: {use_autocast}\tbackend: {backend}")
     print(logs, "\n")
     return logs

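The "latency" entry above comes from `get_inference_time` (reworked in the previous hunk), which wraps the call in `torch.utils.benchmark.Timer`. A self-contained sketch of that pattern, with a hypothetical `sleep_workload` standing in for `do_inference`:

```python
# Sketch of the torch.utils.benchmark.Timer pattern behind get_inference_time().
# sleep_workload is a hypothetical stand-in for do_inference(pipe, ...).
import multiprocessing
import time

from torch.utils.benchmark import Timer


def sleep_workload(n_samples):
    time.sleep(0.01 * n_samples)


if __name__ == "__main__":
    timer = Timer(
        stmt="sleep_workload(n_samples)",
        setup="from __main__ import sleep_workload",
        globals={"n_samples": 2},
        num_threads=multiprocessing.cpu_count(),
    )
    # timeit() includes warmup runs before measuring (see the comment in benchmark.py).
    result = timer.timeit(5)
    print(round(result.mean, 2))
```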
@@ -115,9 +141,8 @@ def get_device_description():
     """
     if device.type == "cpu":
         name = subprocess.check_output(
-            "grep -m 1 'model name' /proc/cpuinfo",
-            shell=True
-        ).decode("utf-8")
+            "grep -m 1 'model name' /proc/cpuinfo", shell=True
+        ).decode("utf-8")
         name = " ".join(name.split(" ")[2:]).strip()
         return name
     else:
@@ -130,14 +155,23 @@ def run_benchmark_grid(grid, n_repeats, num_inference_steps):
        {
            "n_samples": (1, 2),
            "precision": ("single", "half"),
+            "autocast" : ("yes", "no")
        }
    * n_repeats: nb datapoints for inference latency benchmark
    """

    csv_fpath = pathlib.Path(__file__).parent.parent / "benchmark_tmp.csv"
    # create benchmark.csv if not exists
    if not os.path.isfile(csv_fpath):
-        header = ["device", "precision", "runtime", "n_samples", "latency", "memory"]
+        header = [
+            "device",
+            "precision",
+            "autocast",
+            "runtime",
+            "n_samples",
+            "latency",
+            "memory",
+        ]
        with open(csv_fpath, "w") as f:
            writer = csv.writer(f)
            writer.writerow(header)
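With the header above, each result row gains an `autocast` column between `precision` and `runtime`. A small sketch of how rows are expected to line up with that header (the values are placeholders, not measured results):

```python
# Sketch of the benchmark CSV layout with the new "autocast" column.
# Row values below are placeholders, not real benchmark numbers.
import csv
import os

csv_fpath = "benchmark_tmp.csv"
header = ["device", "precision", "autocast", "runtime", "n_samples", "latency", "memory"]

if not os.path.isfile(csv_fpath):
    with open(csv_fpath, "w", newline="") as f:
        csv.writer(f).writerow(header)

with open(csv_fpath, "a", newline="") as f:
    # One row per benchmarked configuration.
    csv.writer(f).writerow(["Example CPU", "single", "no", "pytorch", 1, 0.0, 0.0])
```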
@@ -148,48 +182,58 @@ def run_benchmark_grid(grid, n_repeats, num_inference_steps):
     device_desc = get_device_description()
     for n_samples in grid["n_samples"]:
         for precision in grid["precision"]:
-            for backend in grid["backend"]:
-                try:
-                    new_log = run_benchmark(
-                        n_repeats=n_repeats,
-                        n_samples=n_samples,
-                        precision=precision,
-                        backend=backend,
-                        num_inference_steps=num_inference_steps
-                    )
-                except Exception as e:
-                    if "CUDA out of memory" in str(e) or "Failed to allocate memory" in str(e):
-                        print(str(e))
-                        torch.cuda.empty_cache()
-                        new_log = {
-                            "latency": -1.00,
-                            "memory": -1.00
-                        }
-                    else:
-                        raise e
-
-                latency = new_log["latency"]
-                memory = new_log["memory"]
-                new_row = [device_desc, precision, backend, n_samples, latency, memory]
-                writer.writerow(new_row)
+            use_autocast = False
+            if precision == "half":
+                for autocast in grid["autocast"]:
+                    if autocast == "yes":
+                        use_autocast = True
+            for backend in grid["backend"]:
+                try:
+                    new_log = run_benchmark(
+                        n_repeats=n_repeats,
+                        n_samples=n_samples,
+                        precision=precision,
+                        use_autocast=use_autocast,
+                        backend=backend,
+                        num_inference_steps=num_inference_steps,
+                    )
+                except Exception as e:
+                    if "CUDA out of memory" in str(
+                        e
+                    ) or "Failed to allocate memory" in str(e):
+                        print(str(e))
+                        torch.cuda.empty_cache()
+                        new_log = {"latency": -1.00, "memory": -1.00}
+                    else:
+                        raise e
+
+                latency = new_log["latency"]
+                memory = new_log["memory"]
+                new_row = [
+                    device_desc,
+                    precision,
+                    autocast,
+                    backend,
+                    n_samples,
+                    latency,
+                    memory,
+                ]
+                writer.writerow(new_row)


 if __name__ == "__main__":

     parser = argparse.ArgumentParser()

     parser.add_argument(
-        "--samples",
+        "--samples",
         default="1",
-        type=str,
-        help="Comma sepearated list of batch sizes (number of samples)"
+        type=str,
+        help="Comma sepearated list of batch sizes (number of samples)",
     )

     parser.add_argument(
-        "--steps",
-        default=50,
-        type=int,
-        help="Number of diffusion steps."
+        "--steps", default=50, type=int, help="Number of diffusion steps."
     )

     parser.add_argument(
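In the committed loop, `use_autocast` is derived once per precision: it is True whenever `grid["autocast"]` contains "yes" and the precision is "half", so each (n_samples, precision, backend) combination is still benchmarked once rather than once per autocast setting. A hypothetical helper (not part of this commit) that expands the grid into one (precision, use_autocast) pair per intended run might look like:

```python
# Hypothetical helper (not in this commit): expand the grid so "half" precision
# is benchmarked both with and without autocast, while "single" never uses it.
def expand_autocast_settings(grid):
    for precision in grid["precision"]:
        options = grid["autocast"] if precision == "half" else ("no",)
        for setting in options:
            yield precision, setting == "yes"


if __name__ == "__main__":
    demo_grid = {"precision": ("single", "half"), "autocast": ("yes", "no")}
    for precision, use_autocast in expand_autocast_settings(demo_grid):
        print(precision, use_autocast)  # single False / half True / half False
```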
@@ -199,17 +243,25 @@ def run_benchmark_grid(grid, n_repeats, num_inference_steps):
         help="Number of repeats.",
     )

+    parser.add_argument(
+        "--autocast",
+        default="no",
+        type=str,
+        help="If 'yes', will perform additional runs with autocast activated for half precision inferences",
+    )
+
     args = parser.parse_args()

     grid = {
-        "n_samples": tuple(map(int, args.samples.split(","))),
-        # Only use single-precision for cpu because "LayerNormKernelImpl" not implemented for 'Half' on cpu,
+        "n_samples": tuple(map(int, args.samples.split(","))),
+        # Only use single-precision for cpu because "LayerNormKernelImpl" not implemented for 'Half' on cpu,
         # Remove autocast won't help. Ref:
         # https://github.com/CompVis/stable-diffusion/issues/307
         "precision": ("single",) if device.type == "cpu" else ("single", "half"),
+        "autocast": ("no",) if args.autocast == "no" else ("yes", "no"),
         # Only use onnx for cpu, until issues are fixed by upstreams. Ref:
         # https://github.com/huggingface/diffusers/issues/489#issuecomment-1261577250
         # https://github.com/huggingface/diffusers/pull/440
-        "backend": ("pytorch", "onnx") if device.type == "cpu" else ("pytorch",)
+        "backend": ("pytorch", "onnx") if device.type == "cpu" else ("pytorch",),
     }
     run_benchmark_grid(grid, n_repeats=args.repeats, num_inference_steps=args.steps)
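The new `--autocast` option defaults to "no"; passing "yes" adds "yes" to the grid's autocast settings for half-precision runs. A short sketch of how the flag maps onto the grid, reusing the option names and defaults shown above (the `parse_args` list simulates a command line):

```python
# Sketch of how --autocast feeds the benchmark grid (option names and defaults
# taken from scripts/benchmark.py; the parse_args list simulates a CLI call
# such as: python scripts/benchmark.py --samples 1,2 --autocast yes).
import argparse

parser = argparse.ArgumentParser()
parser.add_argument("--samples", default="1", type=str)
parser.add_argument("--steps", default=50, type=int)
parser.add_argument("--autocast", default="no", type=str)

args = parser.parse_args(["--samples", "1,2", "--autocast", "yes"])

grid = {
    "n_samples": tuple(map(int, args.samples.split(","))),
    "autocast": ("no",) if args.autocast == "no" else ("yes", "no"),
}
print(grid)  # {'n_samples': (1, 2), 'autocast': ('yes', 'no')}
```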
