
Commit

update profiling script
xrsrke committed Nov 1, 2024
1 parent dda00a4 commit b4156dc
Showing 1 changed file with 117 additions and 80 deletions.
197 changes: 117 additions & 80 deletions benchmark/fp8_tp.py

import argparse
import itertools

import pandas as pd
import torch
import torch.distributed as dist
from nanotron.parallel import ParallelContext
from nanotron.parallel.tensor_parallel.enum import TensorParallelLinearMode
from nanotron.parallel.tensor_parallel.nn import FP8TensorParallelColumnLinear
from torch.profiler import ProfilerActivity


def run_experiment(exp_name, M, N, K, TP_SIZE, parallel_context):
    torch.cuda.synchronize()
    input = torch.randn(M, K, device="cuda", requires_grad=True)
    column_linear = FP8TensorParallelColumnLinear(
        in_features=K,
        out_features=N,
        pg=parallel_context.tp_pg,
        mode=TensorParallelLinearMode.ALL_REDUCE,
        device="cuda",
        async_communication=False,
        bias=False,
    )
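
    # NOTE (assumption, not part of this commit): with column parallelism each TP rank
    # holds out_features / TP_SIZE columns of the weight, so the local GEMM is
    # (M x K) @ (K x N / TP_SIZE); in ALL_REDUCE mode the input is replicated in the
    # forward pass and its gradient is all-reduced across TP ranks in the backward.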

    with torch.profiler.profile(
        activities=[ProfilerActivity.CPU, ProfilerActivity.CUDA],
        on_trace_ready=torch.profiler.tensorboard_trace_handler(f"./log/{exp_name}"),
        record_shapes=True,
        profile_memory=True,
        with_stack=True,
        # with_flops=True,
        with_modules=True,
        experimental_config=torch._C._profiler._ExperimentalConfig(verbose=True),
        use_cuda=True,
    ) as prof:
        prof.step()
        sharded_output = column_linear(input)
        sharded_output.sum().backward()
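
    # NOTE: tensorboard_trace_handler writes the trace under ./log/{exp_name};
    # it can be inspected with TensorBoard's profiler plugin (torch-tb-profiler).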


    torch.cuda.synchronize()
    return prof


def print_profiling_table(prof, sort_by="cpu_time_total"):
    print(f"###### sorted by {sort_by} ######")
    print(
        prof.key_averages(group_by_stack_n=100).table(
            sort_by=sort_by,
            row_limit=20,
            max_src_column_width=2000,  # Increase source column width
            top_level_events_only=False,
            max_name_column_width=2000,
            max_shapes_column_width=1000,
        )
    )


def explore_event_values(event):
    for attr in dir(event):
        if not attr.startswith("_"):  # Skip internal attributes
            try:
                value = getattr(event, attr)
                if callable(value):  # Skip methods
                    continue
                print(f"\n{attr}:")
                print(value)
                print("-" * 50)  # Separator for better readability
            except Exception:
                print(f"{attr}: <error accessing attribute>")


def parse_args():
    parser = argparse.ArgumentParser(description="Run profiling experiments with configurable dimensions")
    parser.add_argument("--exp_number", type=str, help="Experiment number")
    parser.add_argument("--tp_size", type=int, default=1, help="Tensor Parallel size")
    parser.add_argument(
        "--dimensions",
        type=str,
        default="1024,2048,4096,8192,16384,32768",
        help="Comma-separated list of dimensions to test",
    )
    return parser.parse_args()


if __name__ == "__main__":
    torch.backends.cudnn.benchmark = True

    args = parse_args()

    # Parse dimensions from a comma-separated string into a list of integers
    dimensions = [int(d.strip()) for d in args.dimensions.split(",")]
    TP_SIZE = args.tp_size
    EXP_NUMBER = args.exp_number

    results = []
    total = len(list(itertools.product(dimensions, dimensions, dimensions)))
    experiment_count = 0
    parallel_context = ParallelContext(data_parallel_size=1, pipeline_parallel_size=1, tensor_parallel_size=TP_SIZE)

    for M, N, K in itertools.product(dimensions, dimensions, dimensions):
        exp_name = f"{EXP_NUMBER}_fp8_m{M}_n{N}_k{K}_and_tp{TP_SIZE}"
        experiment_count += 1
        print(f"Running experiment with M={M}, N={N}, K={K}, {experiment_count}/{total}")

        prof = run_experiment(exp_name, M, N, K, TP_SIZE=TP_SIZE, parallel_context=parallel_context)

        if dist.get_rank() == 0:
            print_profiling_table(prof, sort_by="cpu_time_total")
            print_profiling_table(prof, sort_by="cuda_time_total")
            # explore_event_values(table)

            # NOTE: loop through all events, sum up the total time, then calculate each event's share
            averages = prof.key_averages(group_by_stack_n=100)
            # NOTE: why sum .self_cpu_time_total instead of .cpu_time_total?
            # source: https://github.com/pytorch/pytorch/blob/f14f245747db2f80e963bd72561f5bd5ed216a4a/torch/autograd/profiler_util.py#L976-L990
            # I tested it and it matches torch's generated table
            cpu_time_total_of_all_events = sum(event.self_cpu_time_total for event in averages)
            # Get the top 5 operations by CPU time
            sorted_events = sorted(averages, key=lambda x: x.cpu_time_total, reverse=True)[:5]
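            # NOTE (illustrative numbers, not profiler output): if aten::linear reports
            # cpu_time_total = 10ms and its child aten::mm reports 8ms, summing
            # cpu_time_total counts those 8ms twice; self_cpu_time_total attributes
            # 2ms to linear and 8ms to mm, so the self times of all events partition
            # the wall time and the percentages computed below stay meaningful.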

            for event in sorted_events:
                event_cpu_time_percent = (event.cpu_time_total / cpu_time_total_of_all_events) * 100

                results.append(
                    {
                        "M": M,
                        "N": N,
                        "K": K,
                        "Operation": event.key,
                        "CPU Time (ms)": event.cpu_time_total / 1000,  # Convert to milliseconds
                        "CPU Time %": f"{event_cpu_time_percent:.2f}%",
                        "CUDA Time (ms)": event.cuda_time_total / 1000,  # Convert to milliseconds
                        # "Memory Used (MB)": event.cpu_memory_usage / (1024 * 1024) if event.cpu_memory_usage else 0,
                    }
                )

    if dist.get_rank() == 0:
        df = pd.DataFrame(results)
        print("\nTop 5 most time-consuming operations for each dimension combination:")
        print(df.to_string())
        df.to_csv(
            f'{EXP_NUMBER}_profiling_results_with_m_n_k_with_cartesian_product_{"_".join(map(str, dimensions))}.csv',
            index=False,
        )
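
A minimal launch sketch (assumptions: the script is started via torchrun so that ParallelContext can initialize torch.distributed; the experiment number and dimension list are illustrative, not part of the commit):

    torchrun --nproc_per_node=2 benchmark/fp8_tp.py \
        --exp_number exp900 \
        --tp_size 2 \
        --dimensions 4096,8192

Each rank profiles every (M, N, K) combination from the Cartesian product of --dimensions; rank 0 prints the profiling tables and writes the top-5 operations per combination to a CSV.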
