forked from karpathy/llm.c
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathprofile_gpt2cu.py
196 lines (170 loc) · 6.86 KB
/
profile_gpt2cu.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
# runs profiling with ncu, generates a `profile.ncu-rep` for viewing with NSight Compute, and prints out
# basic kernel stats.
# Note: If you run into errors because of missing access rights to performance counters, try
# https://developer.nvidia.com/nvidia-development-tools-solutions-err_nvgpuctrperm-permission-issue-performance-counters#SolnAdminTag
import subprocess
import csv
from collections import defaultdict
import shutil
# Locate the Nsight Compute command line tool: use whatever `ncu` resolves to
# on PATH, and otherwise guess the standard CUDA toolkit location.
NCU = shutil.which("ncu") or "/usr/local/cuda/bin/ncu"
# Build the executable that is going to be profiled; a failing build raises
# and stops the script.
build_cmd = ["make", "profile_gpt2cu", "NO_MULTI_GPU=1", "USE_CUDNN=1"]
subprocess.check_call(build_cmd)

# Check whether profiling is allowed for non-root users: dump the nvidia
# module configuration and look for a line that disables the admin-only
# restriction.
options = subprocess.check_output(["modprobe", "-c", "nvidia"], text=True)
can_profile = any(
    "NVreg_RestrictProfilingToAdminUsers=0" in line
    for line in options.splitlines()
)
# Record the metrics into `profile.ncu-rep`.
# `--set full` and `--import-source yes` are entirely superfluous for this
# script, but they are kept so that the report can also be inspected manually.
cmd = [NCU, "--set", "full", "--import-source", "yes", "-o", "profile", "-f", "./profile_gpt2cu"]
if not can_profile:
    # profiling is restricted to admin users, so the run has to go through sudo
    print("NVreg_RestrictProfilingToAdminUsers=1, running with sudo")
    cmd.insert(0, "sudo")
subprocess.check_call(cmd)
# Export the recorded report as CSV, restricted to the metrics used below.
# https://forums.developer.nvidia.com/t/converting-nsys-rep-file-into-a-csv-file-with-formatting-like-the-summary-page-in-ncu-gui/231717/3
metrics = [
    "gpu__time_duration.sum",                   # total time
    "dram__bytes_read.sum",                     # DRAM reads
    "dram__bytes_write.sum",                    # DRAM writes
    "lts__t_sectors_srcunit_tex_op_read.sum",   # L2 reads (sectors -- 32B)
    "lts__t_sectors_srcunit_tex_op_write.sum",  # L2 writes (sectors -- 32B)
    "smsp__inst_executed.sum",                  # instructions
]
metric_list = ",".join(metrics)
cmd = [NCU, "-i", "profile.ncu-rep", "--csv", "--page", "raw", "--metrics", metric_list]
csv_text = subprocess.check_output(cmd, text=True)
result = csv_text.strip()
# csv.reader consumes an iterable of lines; line endings are kept on each line
reader = csv.reader(result.splitlines(keepends=True))
# model config
CLS_START = -1  # kernel id at which the classifier section starts; -1 until it is located
CLS_NUM = 6     # number of consecutive kernels that make up the classifier section
N_LAYERS = 12   # factor applied to the measurements of per-layer kernels

# accumulators filled in by the per-kernel loop
summaries = defaultdict(float)  # time per (shortened) kernel name
counts = defaultdict(int)       # summed multipliers per (shortened) kernel name
passes = defaultdict(float)     # time per pass ("enc", "fwd", "cls", "bwd", "opt", "init")
total = defaultdict(float)      # totals for time / read / write / l2_read / l2_write / inst
no_cutlass = 0.0                # time of kernels that do not belong to a grouped matmul/attention family
CC = ""
phase = "fwd"                   # pass the loop is currently in
# Materialize the CSV rows together with their row index, because the rows are
# walked twice: once here to locate the classifier section, and once more for
# the per-kernel report.
kernel_profile_data = list(enumerate(reader))

# Rows 0 and 1 are the headings and the units; the first kernel row (rid 2) is
# skipped here as well. Kernel ids are `rid - 2`.
for rid, row in kernel_profile_data:
    if rid <= 2:
        continue
    kernel = row[4]
    kid = rid - 2
    if "fused_classifier" in kernel:
        # classifier: layernorm -> matmul -> fused -> bw matmul (x2) -> bw layernorm
        # so the section starts two kernels before the fused classifier kernel
        CLS_START = kid - 2

# An explicit check instead of `assert`: asserts are removed when Python runs
# with -O, and with CLS_START left at -1 the report loop would silently label
# the first kernels as classifier kernels.
if CLS_START < 0:
    raise RuntimeError(
        "could not locate the classifier section: no usable 'fused_classifier' "
        "kernel was found in the ncu output"
    )
# Per-kernel report: prints one line per profiled kernel and accumulates the
# per-kernel-name, per-pass and overall totals.
#
# Every table line uses the same column widths (2-wide id, 7-wide pass,
# 40-wide name, then six 8-wide numeric columns), so that the headings, the
# units, the data rows and the totals line up.
print()
print("Kernel calls:")
for rid, row in kernel_profile_data:
    if rid == 0:
        # headings
        print(f"{'id':<2} {'pass':<7} {'name':<40} {'time':>8} {'RAM rd':>8} {'RAM wt':>8} {'L2 rd':>8} {'L2 wt':>8} {'inst':>8}")
        continue
    if rid == 1:
        # units
        units = f"{'':<2} {'':<7} {'':<40} {'ms':>8} {'GiB':>8} {'GiB':>8} {'GiB':>8} {'GiB':>8} {'MInst':>8}"
        print(units)
        print("." * len(units))
        continue
    if rid == 2:
        CC = row[10]

    # actual data
    # NOTE(review): the column indices are fixed positions in ncu's raw CSV
    # page; they presumably follow the alphabetical order of the requested
    # metrics -- confirm when changing the metric list.
    kernel = row[4]
    time = float(row[13])
    read = float(row[11])
    write = float(row[12])
    l2_read = float(row[14])
    l2_write = float(row[15])
    inst = float(row[16]) / 1e6

    kid = rid - 2

    # Assign the kernel to a pass. `phase` tracks where we are in the step:
    # "fwd" until the classifier section is reached, "bwd" from then on, and
    # "bwd-enc" once an encoder kernel has been seen during the backward pass.
    multiplier = 1
    if "encoder" in kernel:
        pass_name = "enc"
        if phase == "bwd":
            phase = "bwd-enc"
    elif CLS_START <= kid < CLS_START + CLS_NUM:
        # the classifier part, counts only once
        pass_name = "cls"
        phase = "bwd"
    elif "adamw" in kernel:
        pass_name = "opt"
    elif phase == "bwd-enc":
        # before the first optimizer run, we create weight copies.
        # they aren't part of regular processing, so they get a multiplier
        # of zero (and are left out of the totals further down)
        pass_name = "init"
        multiplier = 0
    else:
        # a kernel of the forward or backward blocks: its measurements are
        # scaled by the number of layers
        pass_name = phase
        multiplier = N_LAYERS
        time *= N_LAYERS
        read *= N_LAYERS
        write *= N_LAYERS
        l2_read *= N_LAYERS
        l2_write *= N_LAYERS
        inst *= N_LAYERS

    # split at "(" -- argument list
    fn_name = kernel.split("(")[0]
    # some names include the return value, others don't?
    if " " in fn_name:
        fn_name = fn_name.split(" ")[1]
    # drop template arguments
    if "<" in fn_name:
        fn_name = fn_name.split("<")[0]

    # group together matmul kernels
    if "cutlass" in fn_name:
        pass
    elif fn_name.startswith("ampere_bf16"):
        fn_name = "ampere_bf16"
    elif fn_name.startswith("cudnn_generated_fort_native_sdpa"):
        fn_name = "cudnn_generated_fort_native_sdpa"
    else:
        no_cutlass += time

    # convert L2 from 32-byte sectors to GiB
    l2_read = l2_read * 32 / 1024 / 1024 / 1024
    l2_write = l2_write * 32 / 1024 / 1024 / 1024

    summaries[fn_name] += time
    counts[fn_name] += multiplier
    passes[pass_name] += time
    if pass_name != "init":
        total['time'] += time
        total['read'] += read
        total['write'] += write
        total['l2_read'] += l2_read
        total['l2_write'] += l2_write
        total['inst'] += inst

    pass_info = f"{pass_name}×{multiplier}"
    print(f"{kid:02} {pass_info:7} {fn_name:<40} {time:8.2f} {read:8.2f} {write:8.2f} {l2_read:8.2f} {l2_write:8.2f} {inst:8.2f}")

total_time = total['time']
print("." * len(units))
print(f"{'':<2} {'':<7} {'Total':<40} {total['time']:8.2f} {total['read']:8.2f} {total['write']:8.2f} {total['l2_read']:8.2f} {total['l2_write']:8.2f} {total['inst']:8.2f}")
# Per-kernel-name summary, sorted by accumulated time (largest first).
print()
print("Kernel type summaries:")
# The "frac" heading is 7 wide because every row prints a 6-wide value
# followed by "%"; with a 6-wide heading the "count" column would sit one
# character left of its values.
print(f" {'name':<40} {'time':>6} {'frac':>7} {'count':>6}")
ordered_time = sorted(summaries.items(), key=lambda x: x[1], reverse=True)
for entry, value in ordered_time:
    # crop entry to be at most 40 characters
    entry_text = entry if len(entry) <= 40 else entry[:37] + "..."
    print(f" {entry_text:<40} {value:6.2f} {100*value / total_time:6.2f}% {counts[entry]:>6d}")
# Total step time divided by 1000 (ms -> s), used for the per-second rates.
ts = total_time / 1000


def _pass_share(pass_name):
    """Format the time of one pass together with its share of the total step time."""
    pass_time = passes[pass_name]
    return f"{pass_time:.1f}ms ({100 * pass_time / total_time:.1f}%)"


# The text starts and ends with an empty line, hence the empty first and last
# entries before joining.
summary_lines = [
    "",
    f"In total, a training step takes {total_time:.1f}ms, distributed as:",
    f"{_pass_share('enc')} in the encoder,",
    f"{_pass_share('fwd')} in forward blocks,",
    f"{_pass_share('cls')} in the classifier part,",
    f"{_pass_share('bwd')} in backward blocks, and",
    f"{_pass_share('opt')} in the optimizer.",
    f"We read {total['read']:.1f}GiB ({total['read']/ts:.1f}GB/s) and write {total['write']:.1f}GiB ({total['write']/ts:.1f}GB/s) to DRAM,",
    f"read {total['l2_read']:.1f}GiB ({total['l2_read']/ts:.1f}GB/s) and write {total['l2_write']:.1f}GiB ({total['l2_write']/ts:.1f}GB/s) to L2,",
    f"and execute {total['inst'] / 1000:.1f} billion instructions ({total['inst'] / 1000 / ts:.1f} GInst/s).",
    "",
]
summary = "\n".join(summary_lines)
print(summary)