diff --git a/torchao/utils.py b/torchao/utils.py
new file mode 100644
index 0000000000..c414843da1
--- /dev/null
+++ b/torchao/utils.py
@@ -0,0 +1,26 @@
+import torch
+
+
+def benchmark_model(model, num_runs, input_tensor):
+    torch.cuda.synchronize()
+    start_event = torch.cuda.Event(enable_timing=True)
+    end_event = torch.cuda.Event(enable_timing=True)
+    start_event.record()
+
+    # benchmark
+    for _ in range(num_runs):
+        with torch.autograd.profiler.record_function("timed region"):
+            model(input_tensor)
+
+    end_event.record()
+    torch.cuda.synchronize()
+    return start_event.elapsed_time(end_event) / num_runs
+
+def profiler_runner(path, fn, *args, **kwargs):
+    with torch.profiler.profile(
+            activities=[torch.profiler.ProfilerActivity.CPU,
+                        torch.profiler.ProfilerActivity.CUDA],
+            record_shapes=True) as prof:
+        result = fn(*args, **kwargs)
+    prof.export_chrome_trace(path)
+    return result
diff --git a/tutorials/quantize_vit/run_vit_b.py b/tutorials/quantize_vit/run_vit_b.py
index ab19f7ba28..a7fd78f9b2 100644
--- a/tutorials/quantize_vit/run_vit_b.py
+++ b/tutorials/quantize_vit/run_vit_b.py
@@ -1,6 +1,8 @@
 import torch
 import torchvision.models.vision_transformer as models
 
+from torchao.utils import benchmark_model, profiler_runner
+torch.set_float32_matmul_precision("high")
 
 # Load Vision Transformer model
 model = models.vit_b_16(pretrained=True)
@@ -12,30 +14,6 @@
 
 model = torch.compile(model, mode='max-autotune')
 
-def benchmark_model(model, num_runs, input_tensor):
-    torch.cuda.synchronize()
-    start_event = torch.cuda.Event(enable_timing=True)
-    end_event = torch.cuda.Event(enable_timing=True)
-    start_event.record()
-
-    # benchmark
-    for _ in range(num_runs):
-        with torch.autograd.profiler.record_function("timed region"):
-            model(input_tensor)
-
-    end_event.record()
-    torch.cuda.synchronize()
-    return start_event.elapsed_time(end_event) / num_runs
-
-def profiler_runner(path, fn, *args, **kwargs):
-    with torch.profiler.profile(
-            activities=[torch.profiler.ProfilerActivity.CPU,
-                        torch.profiler.ProfilerActivity.CUDA],
-            record_shapes=True) as prof:
-        result = fn(*args, **kwargs)
-    prof.export_chrome_trace(path)
-    return result
-
 # Must run with no_grad when optimizing for inference
 with torch.no_grad():
     # warmup
diff --git a/tutorials/quantize_vit/run_vit_b_quant.py b/tutorials/quantize_vit/run_vit_b_quant.py
index c329c28d0c..0396a9dffd 100644
--- a/tutorials/quantize_vit/run_vit_b_quant.py
+++ b/tutorials/quantize_vit/run_vit_b_quant.py
@@ -2,6 +2,8 @@
 import torchao
 import torchvision.models.vision_transformer as models
 
+from torchao.utils import benchmark_model, profiler_runner
+torch.set_float32_matmul_precision("high")
 
 # Load Vision Transformer model
 model = models.vit_b_16(pretrained=True)
@@ -19,30 +21,6 @@
 
 model = torch.compile(model, mode='max-autotune')
 
-def benchmark_model(model, num_runs, input_tensor):
-    torch.cuda.synchronize()
-    start_event = torch.cuda.Event(enable_timing=True)
-    end_event = torch.cuda.Event(enable_timing=True)
-    start_event.record()
-
-    # benchmark
-    for _ in range(num_runs):
-        with torch.autograd.profiler.record_function("timed region"):
-            model(input_tensor)
-
-    end_event.record()
-    torch.cuda.synchronize()
-    return start_event.elapsed_time(end_event) / num_runs
-
-def profiler_runner(path, fn, *args, **kwargs):
-    with torch.profiler.profile(
-            activities=[torch.profiler.ProfilerActivity.CPU,
-                        torch.profiler.ProfilerActivity.CUDA],
-            record_shapes=True) as prof:
-        result = fn(*args, **kwargs)
-    prof.export_chrome_trace(path)
-    return result
-
 # Must run with no_grad when optimizing for inference
 with torch.no_grad():
     # warmup
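Usage note (illustrative, not part of the patch): a minimal sketch of how the relocated helpers are intended to be called, mirroring the warmup/benchmark/profile flow of the tutorial scripts this diff edits. The model setup and the "trace.json" output path are assumptions for the example, not taken from the diff.

import torch
import torchvision.models.vision_transformer as models

from torchao.utils import benchmark_model, profiler_runner

torch.set_float32_matmul_precision("high")

# Example model/input; any CUDA model with a matching input works the same way.
model = models.vit_b_16(pretrained=True).eval().cuda().to(torch.bfloat16)
input_tensor = torch.randn(1, 3, 224, 224, dtype=torch.bfloat16, device="cuda")

with torch.no_grad():
    # warmup, so lazy initialization/compilation is excluded from timing
    benchmark_model(model, 5, input_tensor)
    # average latency per iteration in milliseconds (CUDA-event timing)
    print("elapsed time, ms:", benchmark_model(model, 100, input_tensor))
    # capture a Chrome trace (open in chrome://tracing or Perfetto);
    # "trace.json" is an illustrative output path
    profiler_runner("trace.json", benchmark_model, model, 5, input_tensor)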