Commit bd37672

Add CPU support and update README (#119)

1 parent 2ad6b94 commit bd37672

4 files changed: +57 −22 lines changed

README.md (+4)

@@ -17,6 +17,10 @@ For example:
 ```
 pip3 install --pre torch torchvision torchaudio --index-url https://download.pytorch.org/whl/nightly/cu121
 ```
+or
+```
+pip3 install --pre torch torchvision torchaudio --index-url https://download.pytorch.org/whl/nightly/cpu
+```
 
 Installation instructions vary by platform. Please see the website https://pytorch.org/
 
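A quick way to confirm which build ended up installed (a minimal sketch; the version-suffix convention is PyTorch's, the script itself is illustrative):

```
import torch

# Nightly wheels encode the backend in the version string,
# e.g. "2.4.0.dev...+cpu" or "2.2.0.dev...+cu121".
print(torch.__version__)
# False for the CPU-only wheel; True when a CUDA wheel finds a GPU.
print(torch.cuda.is_available())
```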
experiments/README.md (+17 −4)

@@ -37,6 +37,7 @@ These experiments were run on an Amazon p4d.24xlarge instance. See the Product
 - 1152 GiB of RAM
 - Software
 
+Meanwhile, these experiments (fp32, bf16, compile, SDPA, Triton, NT) can also run on a CPU platform. Results will be shared in the near future.
 
 ### Versions

@@ -47,11 +48,17 @@ These experiments were run on an Amazon p4d.24xlarge instance. See the Product
 ### Installation instructions
 
 ```
-$ conda create -n nightly20231117py310
-$ conda activate nightly20231117py310
+$ conda create -n nightlypy310
+$ conda activate nightlypy310
 $ conda install python=3.10
-$ pip install https://download.pytorch.org/whl/nightly/cu121/torch-2.2.0.dev20231117%2Bcu121-cp310-cp310-linux_x86_64.whl
-$ pip install https://download.pytorch.org/whl/nightly/cu121/torchvision-0.17.0.dev20231117%2Bcu121-cp310-cp310-linux_x86_64.whl
+For GPU:
+- $ pip install https://download.pytorch.org/whl/nightly/cu121/torch-2.2.0.dev20231117%2Bcu121-cp310-cp310-linux_x86_64.whl
+- $ pip install https://download.pytorch.org/whl/nightly/cu121/torchvision-0.17.0.dev20231117%2Bcu121-cp310-cp310-linux_x86_64.whl
+For CPU:
+- $ pip install https://download.pytorch.org/whl/nightly/cpu/torch-2.4.0.dev20240509%2Bcpu-cp310-cp310-linux_x86_64.whl
+- $ pip install https://download.pytorch.org/whl/nightly/cpu/torchvision-0.19.0.dev20240509%2Bcpu-cp310-cp310-linux_x86_64.whl
+- $ pip install triton
+
 $ git clone https://github.com/cpuhrsch/segment-anything.git
 $ cd segment-anything
 $ pip install -e .
@@ -66,10 +73,16 @@ If you plan to run the scripts that run the experiments from segment-anything-fast
 
 ### How to run experiments
 
+For a GPU platform:
 ```
 $ python run_experiments.py 16 vit_b <pytorch_github> <segment-anything_github> <path_to_experiments_data> --run-experiments --num-workers 32
 ```
 
+For a CPU platform, set SEGMENT_ANYTHING_FAST_USE_FLASH_4 to 0, since the custom flash attention kernels were written specifically for the A100:
+```
+$ SEGMENT_ANYTHING_FAST_USE_FLASH_4=0 python run_experiments.py 16 vit_b <pytorch_github> <segment-anything_github> <path_to_experiments_data> --run-experiments --num-workers 32 --device cpu
+```
+
 If at any point you run into an issue, please note that you can increase verbosity by adding `--capture_output False` to the above command. Also, please don't hesitate to open an issue.
 
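The same switch can also be set from Python, as a hedged sketch (the variable name comes from the README above; the assumption that it should be set before the library is imported is mine, not stated in the source):

```
import os

# Set before importing segment_anything_fast, in case the flag is
# read at import time when the attention kernel is selected.
os.environ["SEGMENT_ANYTHING_FAST_USE_FLASH_4"] = "0"

import segment_anything_fast  # noqa: E402
```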
experiments/eval_combo.py (+29 −15)

@@ -5,6 +5,7 @@
 from data import build_data, setup_coco_img_ids
 import math
 import segment_anything_fast
+import time
 
 torch._dynamo.config.cache_size_limit = 50000
 
@@ -64,10 +65,13 @@ def build_results_batch_nested(predictor, batch, batch_size, pad_input_image_batch
     # We explicitly exclude data transfers from the timing to focus
     # only on the kernel performance.
     # Next we synchronize and set two events to start timing.
-    torch.cuda.synchronize()
-    start_event = torch.cuda.Event(enable_timing=True)
-    end_event = torch.cuda.Event(enable_timing=True)
-    start_event.record()
+    if torch.cuda.is_available():
+        torch.cuda.synchronize()
+        start_event = torch.cuda.Event(enable_timing=True)
+        end_event = torch.cuda.Event(enable_timing=True)
+        start_event.record()
+    else:
+        t0 = time.time()
 
     with torch.autograd.profiler.record_function("timed region"):
         with torch.autograd.profiler.record_function("image encoder"):
@@ -93,9 +97,12 @@ def build_results_batch_nested(predictor, batch, batch_size, pad_input_image_batch
     # the amount of time spent on the GPU. This is a fairly tight measurement
     # around the launched GPU kernels and excludes data movement from host
     # to device.
-    end_event.record()
-    torch.cuda.synchronize()
-    elapsed_time = start_event.elapsed_time(end_event)
+    if torch.cuda.is_available():
+        end_event.record()
+        torch.cuda.synchronize()
+        elapsed_time = start_event.elapsed_time(end_event)
+    else:
+        elapsed_time = time.time() - t0
     return sum(result_batch, []), orig_input_image_batch_size, elapsed_time
 
 def build_results_batch(predictor, batch, batch_size, pad_input_image_batch):
@@ -123,10 +130,13 @@ def build_results_batch(predictor, batch, batch_size, pad_input_image_batch):
     # We explicitly exclude data transfers from the timing to focus
     # only on the kernel performance.
     # Next we synchronize and set two events to start timing.
-    torch.cuda.synchronize()
-    start_event = torch.cuda.Event(enable_timing=True)
-    end_event = torch.cuda.Event(enable_timing=True)
-    start_event.record()
+    if torch.cuda.is_available():
+        torch.cuda.synchronize()
+        start_event = torch.cuda.Event(enable_timing=True)
+        end_event = torch.cuda.Event(enable_timing=True)
+        start_event.record()
+    else:
+        t0 = time.time()
 
     with torch.autograd.profiler.record_function("timed region"):
         with torch.autograd.profiler.record_function("image encoder"):
@@ -157,9 +167,12 @@ def build_results_batch(predictor, batch, batch_size, pad_input_image_batch):
     # the amount of time spent on the GPU. This is a fairly tight measurement
     # around the launched GPU kernels and excludes data movement from host
     # to device.
-    end_event.record()
-    torch.cuda.synchronize()
-    elapsed_time = start_event.elapsed_time(end_event)
+    if torch.cuda.is_available():
+        end_event.record()
+        torch.cuda.synchronize()
+        elapsed_time = start_event.elapsed_time(end_event)
+    else:
+        elapsed_time = time.time() - t0
     return result_batch, orig_input_image_batch_size, elapsed_time
 
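The same CUDA-events-or-wall-clock pattern appears in both batch builders above. A minimal sketch of the technique in isolation (the `timed` helper name is illustrative, not from the repository); note that `torch.cuda.Event.elapsed_time` returns milliseconds while a `time.time()` difference is in seconds, so a shared helper should normalize units:

```
import time
import torch

def timed(fn):
    """Run fn() and return (result, elapsed_seconds): CUDA events give a
    tight measurement around launched kernels on GPU; wall clock on CPU."""
    if torch.cuda.is_available():
        torch.cuda.synchronize()  # drain pending kernels before timing
        start_event = torch.cuda.Event(enable_timing=True)
        end_event = torch.cuda.Event(enable_timing=True)
        start_event.record()
        result = fn()
        end_event.record()
        torch.cuda.synchronize()  # wait until end_event has been recorded
        return result, start_event.elapsed_time(end_event) / 1000.0  # ms -> s
    else:
        t0 = time.time()
        result = fn()
        return result, time.time() - t0
```

In the diff itself the two branches keep the units as returned (milliseconds on GPU, seconds on CPU), so downstream consumers of `elapsed_time` see different units depending on the device.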
@@ -290,6 +303,7 @@ def run(
     memory_path=None,
     use_local_sam_fork=False,
     use_compiler_settings=False,
+    device="cuda"
 ):
     from torch._inductor import config as inductorconfig
     inductorconfig.triton.unique_kernel_names = True

@@ -327,7 +341,7 @@
     else:
         from segment_anything import sam_model_registry, SamPredictor
         checkpoint_path = model_type_to_checkpoint[sam_model_type]
-        sam = sam_model_registry[sam_model_type](checkpoint=checkpoint_path).cuda()
+        sam = sam_model_registry[sam_model_type](checkpoint=checkpoint_path).to(torch.device(device))
         predictor = SamPredictor(sam)
 
         from segment_anything_fast import tools
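Replacing the hard-coded `.cuda()` with `.to(torch.device(device))` is what lets one checkpoint-loading path serve both devices. A hedged usage sketch (the checkpoint filename is illustrative, the device-selection idiom is mine):

```
import torch
from segment_anything import sam_model_registry, SamPredictor

device = "cuda" if torch.cuda.is_available() else "cpu"

# .to() is a no-op when the model already lives on the target device,
# so a single code path covers both GPU and CPU runs.
sam = sam_model_registry["vit_b"](checkpoint="sam_vit_b_01ec64.pth")
sam = sam.to(torch.device(device))
predictor = SamPredictor(sam)
```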

experiments/run_experiments.py (+7 −3)

@@ -45,7 +45,8 @@ def run_experiment(experiments_data,
                    limit=None,
                    profile_path=None,
                    profile_top=False,
-                   memory_path=None):
+                   memory_path=None,
+                   device="cuda"):
     root_cmd = ["python", "eval_combo.py",
                 "--coco_root_dir",
                 f"{experiments_data}/datasets/coco2017",
@@ -84,6 +85,7 @@ def run_experiment(experiments_data,
         args = args + ["--memory-path", memory_path]
     if extra_args is None:
         extra_args = []
+    args = args + ["--device", device]
     args = args + extra_args
     if print_header:
         args = args + ["--print_header", "True"]
@@ -145,7 +147,8 @@ def run(batch_size,
         num_workers=32,
         print_header=True,
         capture_output=True,
-        local_fork_only=False):
+        local_fork_only=False,
+        device="cuda"):
 
     assert model == "vit_b" or model == "vit_h"
 
@@ -155,7 +158,8 @@ def run(batch_size,
             model,
             batch_size=batch_size,
             num_workers=num_workers,
-            capture_output=capture_output)
+            capture_output=capture_output,
+            device=device)
 
     print_header = True
     if run_traces:
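For reference, a minimal sketch of how the new `device` flag flows from `run_experiments.py` into the `eval_combo.py` subprocess (the `launch_eval` helper below is illustrative, not the repository's exact code):

```
import subprocess

def launch_eval(device: str = "cuda", extra_args=None):
    # Build the eval_combo.py command line; the --device flag added in
    # this commit is forwarded to eval_combo.run(), which moves the SAM
    # model with .to(torch.device(device)).
    args = ["python", "eval_combo.py", "--device", device]
    args += extra_args or []
    return subprocess.run(args, capture_output=True, check=False)

# Example: run the evaluation on CPU.
# launch_eval(device="cpu")
```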
