 import pathlib
 import csv
 from contextlib import nullcontext
 import torch
 from torch import autocast
 from diffusers import StableDiffusionPipeline, StableDiffusionOnnxPipeline

 prompt = "a photo of an astronaut riding a horse on mars"

+
 def get_inference_pipeline(precision, backend):
     """
     returns HuggingFace diffuser pipeline
@@ -31,12 +33,14 @@ def get_inference_pipeline(precision, backend):
             torch_dtype=torch.float32 if precision == "single" else torch.float16,
         )
         pipe = pipe.to(device)
-    else:
+    else:
         pipe = StableDiffusionOnnxPipeline.from_pretrained(
             "CompVis/stable-diffusion-v1-4",
             use_auth_token=os.environ["ACCESS_TOKEN"],
             revision="onnx",
-            provider="CPUExecutionProvider" if device.type == "cpu" else "CUDAExecutionProvider",
+            provider="CPUExecutionProvider"
+            if device.type == "cpu"
+            else "CUDAExecutionProvider",
             torch_dtype=torch.float32 if precision == "single" else torch.float16,
         )

@@ -51,43 +55,59 @@ def null_safety(images, **kwargs):
     return pipe


-def do_inference(pipe, n_samples, precision, num_inference_steps):
+def do_inference(pipe, n_samples, use_autocast, num_inference_steps):
     torch.cuda.empty_cache()
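+    # use torch autocast only on CUDA and only when requested;
+    # otherwise fall back to the no-op nullcontext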
-    context = autocast if (device.type == "cuda" and precision == 'half') else nullcontext
+    context = (
+        autocast if (device.type == "cuda" and use_autocast) else nullcontext
+    )
     with context("cuda"):
-        images = pipe(prompt=[prompt] * n_samples, num_inference_steps=num_inference_steps).images
+        images = pipe(
+            prompt=[prompt] * n_samples, num_inference_steps=num_inference_steps
+        ).images

     return images


-def get_inference_time(pipe, n_samples, n_repeats, precision, num_inference_steps):
+def get_inference_time(
+    pipe, n_samples, n_repeats, use_autocast, num_inference_steps
+):
     from torch.utils.benchmark import Timer
+
     timer = Timer(
-        stmt="do_inference(pipe, n_samples, precision, num_inference_steps)",
+        stmt="do_inference(pipe, n_samples, use_autocast, num_inference_steps)",
         setup="from __main__ import do_inference",
-        globals={"pipe": pipe, "n_samples": n_samples, "precision": precision, "num_inference_steps": num_inference_steps},
-        num_threads=multiprocessing.cpu_count()
+        globals={
+            "pipe": pipe,
+            "n_samples": n_samples,
+            "use_autocast": use_autocast,
+            "num_inference_steps": num_inference_steps,
+        },
+        num_threads=multiprocessing.cpu_count(),
     )
     profile_result = timer.timeit(
         n_repeats
     )  # benchmark.Timer performs 2 iterations for warmup
     return round(profile_result.mean, 2)


-def get_inference_memory(pipe, n_samples, precision, num_inference_steps):
+def get_inference_memory(pipe, n_samples, use_autocast, num_inference_steps):
     if not torch.cuda.is_available():
         return 0
-
+
     torch.cuda.empty_cache()
-    context = autocast if (device.type == "cuda" and precision == 'half') else nullcontext
+    context = autocast if (device.type == "cuda" and use_autocast) else nullcontext
     with context("cuda"):
-        images = pipe(prompt=[prompt] * n_samples, num_inference_steps=num_inference_steps).images
+        images = pipe(
+            prompt=[prompt] * n_samples, num_inference_steps=num_inference_steps
+        ).images

     mem = torch.cuda.memory_reserved()
     return round(mem / 1e9, 2)


-def run_benchmark(n_repeats, n_samples, precision, backend, num_inference_steps):
+def run_benchmark(
+    n_repeats, n_samples, precision, use_autocast, backend, num_inference_steps
+):
     """
     * n_repeats: nb datapoints for inference latency benchmark
     * n_samples: number of samples to generate (~ batch size)
@@ -100,10 +120,16 @@ def run_benchmark(n_repeats, n_samples, precision, backend, num_inference_steps)
     pipe = get_inference_pipeline(precision, backend)

     logs = {
-        "memory": 0.00 if device.type == "cpu" else get_inference_memory(pipe, n_samples, precision, num_inference_steps),
-        "latency": get_inference_time(pipe, n_samples, n_repeats, precision, num_inference_steps),
+        "memory": 0.00
+        if device.type == "cpu"
+        else get_inference_memory(
+            pipe, n_samples, use_autocast, num_inference_steps
+        ),
+        "latency": get_inference_time(
+            pipe, n_samples, n_repeats, use_autocast, num_inference_steps
+        ),
     }
-    print(f"n_samples: {n_samples}\tprecision: {precision}\tbackend: {backend}")
+    print(f"n_samples: {n_samples}\tprecision: {precision}\tautocast: {use_autocast}\tbackend: {backend}")
     print(logs, "\n")
     return logs

@@ -115,9 +141,8 @@ def get_device_description():
115141 """
116142 if device .type == "cpu" :
117143 name = subprocess .check_output (
118- "grep -m 1 'model name' /proc/cpuinfo" ,
119- shell = True
120- ).decode ("utf-8" )
144+ "grep -m 1 'model name' /proc/cpuinfo" , shell = True
145+ ).decode ("utf-8" )
121146 name = " " .join (name .split (" " )[2 :]).strip ()
122147 return name
123148 else :
@@ -130,14 +155,23 @@ def run_benchmark_grid(grid, n_repeats, num_inference_steps):
     {
         "n_samples": (1, 2),
         "precision": ("single", "half"),
+        "autocast": ("yes", "no"),
     }
     * n_repeats: nb datapoints for inference latency benchmark
     """

     csv_fpath = pathlib.Path(__file__).parent.parent / "benchmark_tmp.csv"
     # create the benchmark csv if it does not exist
     if not os.path.isfile(csv_fpath):
-        header = ["device", "precision", "runtime", "n_samples", "latency", "memory"]
+        header = [
+            "device",
+            "precision",
+            "autocast",
+            "runtime",
+            "n_samples",
+            "latency",
+            "memory",
+        ]
         with open(csv_fpath, "w") as f:
             writer = csv.writer(f)
             writer.writerow(header)
@@ -148,48 +182,58 @@ def run_benchmark_grid(grid, n_repeats, num_inference_steps):
         device_desc = get_device_description()
         for n_samples in grid["n_samples"]:
             for precision in grid["precision"]:
-                for backend in grid["backend"]:
-                    try:
-                        new_log = run_benchmark(
-                            n_repeats=n_repeats,
-                            n_samples=n_samples,
-                            precision=precision,
-                            backend=backend,
-                            num_inference_steps=num_inference_steps
-                        )
-                    except Exception as e:
-                        if "CUDA out of memory" in str(e) or "Failed to allocate memory" in str(e):
-                            print(str(e))
-                            torch.cuda.empty_cache()
-                            new_log = {
-                                "latency": -1.00,
-                                "memory": -1.00
-                            }
-                        else:
-                            raise e
-
-                    latency = new_log["latency"]
-                    memory = new_log["memory"]
-                    new_row = [device_desc, precision, backend, n_samples, latency, memory]
-                    writer.writerow(new_row)
+                # half precision is benchmarked with every requested autocast setting;
+                # single precision always runs with autocast disabled
+                for autocast in grid["autocast"] if precision == "half" else ("no",):
+                    use_autocast = autocast == "yes"
+                    for backend in grid["backend"]:
+                        try:
+                            new_log = run_benchmark(
+                                n_repeats=n_repeats,
+                                n_samples=n_samples,
+                                precision=precision,
+                                use_autocast=use_autocast,
+                                backend=backend,
+                                num_inference_steps=num_inference_steps,
+                            )
+                        except Exception as e:
+                            if "CUDA out of memory" in str(
+                                e
+                            ) or "Failed to allocate memory" in str(e):
+                                print(str(e))
+                                torch.cuda.empty_cache()
+                                new_log = {"latency": -1.00, "memory": -1.00}
+                            else:
+                                raise e
+
+                        latency = new_log["latency"]
+                        memory = new_log["memory"]
+                        new_row = [
+                            device_desc,
+                            precision,
+                            autocast,
+                            backend,
+                            n_samples,
+                            latency,
+                            memory,
+                        ]
+                        writer.writerow(new_row)


 if __name__ == "__main__":

     parser = argparse.ArgumentParser()

     parser.add_argument(
182- "--samples" ,
229+ "--samples" ,
183230 default = "1" ,
184- type = str ,
185- help = "Comma sepearated list of batch sizes (number of samples)"
231+ type = str ,
232+ help = "Comma sepearated list of batch sizes (number of samples)" ,
186233 )
187234
     parser.add_argument(
-        "--steps",
-        default=50,
-        type=int,
-        help="Number of diffusion steps."
+        "--steps", default=50, type=int, help="Number of diffusion steps."
     )

     parser.add_argument(
@@ -199,17 +243,25 @@ def run_benchmark_grid(grid, n_repeats, num_inference_steps):
         help="Number of repeats.",
     )

+    parser.add_argument(
+        "--autocast",
+        default="no",
+        type=str,
+        help="If 'yes', performs additional runs with autocast enabled for half-precision inference",
+    )
+
     args = parser.parse_args()

     grid = {
-        "n_samples": tuple(map(int, args.samples.split(","))),
-        # Only use single-precision for cpu because "LayerNormKernelImpl" not implemented for 'Half' on cpu,
+        "n_samples": tuple(map(int, args.samples.split(","))),
+        # Only use single-precision for cpu because "LayerNormKernelImpl" not implemented for 'Half' on cpu,
         # removing autocast won't help. Ref:
         # https://github.com/CompVis/stable-diffusion/issues/307
         "precision": ("single",) if device.type == "cpu" else ("single", "half"),
261+ "autocast" : ("no" ,) if args .autocast == "no" else ("yes" , "no" ),
210262 # Only use onnx for cpu, until issues are fixed by upstreams. Ref:
211263 # https://github.com/huggingface/diffusers/issues/489#issuecomment-1261577250
212264 # https://github.com/huggingface/diffusers/pull/440
213- "backend" : ("pytorch" , "onnx" ) if device .type == "cpu" else ("pytorch" ,)
265+ "backend" : ("pytorch" , "onnx" ) if device .type == "cpu" else ("pytorch" ,),
214266 }
215267 run_benchmark_grid (grid , n_repeats = args .repeats , num_inference_steps = args .steps )
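A minimal invocation sketch for the updated script (the filename benchmark.py and the token value are assumptions; --samples, --steps, and --autocast are the flags defined above, and the HuggingFace token is read from the ACCESS_TOKEN environment variable as in get_inference_pipeline):

    ACCESS_TOKEN=<your_hf_token> python benchmark.py --samples 1,2 --steps 50 --autocast yes

Results are appended to the benchmark_tmp.csv file whose path is built in run_benchmark_grid.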