-
Notifications
You must be signed in to change notification settings - Fork 122
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
Showing
1 changed file
with
117 additions
and
80 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,112 +1,149 @@ | ||
# from nanotron import distributed as dist | ||
import argparse | ||
import itertools | ||
|
||
# import torch.distributed as dist | ||
import pandas as pd | ||
import torch | ||
import torch.distributed as dist | ||
from nanotron.parallel import ParallelContext | ||
from nanotron.parallel.tensor_parallel.enum import TensorParallelLinearMode | ||
from nanotron.parallel.tensor_parallel.nn import ( | ||
FP8TensorParallelColumnLinear, | ||
) | ||
from nanotron.parallel.tensor_parallel.nn import FP8TensorParallelColumnLinear | ||
from torch.profiler import ProfilerActivity | ||
|
||
if __name__ == "__main__": | ||
# NOTE: divisible by 16 for TP | ||
in_features = 4096 | ||
out_features = 4096 * 4 | ||
|
||
parallel_context = ParallelContext(data_parallel_size=1, pipeline_parallel_size=1, tensor_parallel_size=2) | ||
# out_features_per_tp_rank = 16 | ||
# out_features = parallel_context.tp_pg.size() * out_features_per_tp_rank | ||
|
||
batch_size = 128 | ||
seq_len = 8192 | ||
merged_gbs = batch_size * seq_len | ||
sharded_random_input = torch.randn(batch_size, in_features, device="cuda", requires_grad=True) | ||
|
||
def run_experiment(exp_name, M, N, K, TP_SIZE, parallel_context):
    """Profile one forward/backward pass of an FP8 column-parallel linear layer.

    Args:
        exp_name: Sub-directory name under ``./log`` that receives the
            TensorBoard trace of this run.
        M: Number of rows of the random input, which has shape ``(M, K)``.
        N: ``out_features`` passed to the linear layer.
        K: ``in_features`` passed to the linear layer.
        TP_SIZE: Tensor-parallel size. Not read inside this function; kept in
            the signature so existing callers keep working.
        parallel_context: Object whose ``tp_pg`` attribute is handed to the
            layer as its process group.

    Returns:
        The ``torch.profiler.profile`` object, after the profiled region has
        exited and ``torch.cuda.synchronize()`` has been called.
    """
    torch.cuda.synchronize()
    # Named ``inputs`` so the ``input`` builtin is not shadowed.
    inputs = torch.randn(M, K, device="cuda", requires_grad=True)
    column_linear = FP8TensorParallelColumnLinear(
        in_features=K,
        out_features=N,
        pg=parallel_context.tp_pg,
        mode=TensorParallelLinearMode.ALL_REDUCE,
        device="cuda",
        async_communication=False,
        bias=False,
    )

    with torch.profiler.profile(
        activities=[ProfilerActivity.CPU, ProfilerActivity.CUDA],
        on_trace_ready=torch.profiler.tensorboard_trace_handler(f"./log/{exp_name}"),
        record_shapes=True,
        profile_memory=True,
        with_stack=True,
        with_modules=True,
        experimental_config=torch._C._profiler._ExperimentalConfig(verbose=True),
        # NOTE(review): ``use_cuda`` is documented as deprecated in favour of
        # ``activities`` (which already lists CUDA); kept to match the commit.
        use_cuda=True,
    ) as prof:
        prof.step()
        sharded_output = column_linear(inputs)
        sharded_output.sum().backward()

    # Wait for outstanding GPU work before handing the profiler back.
    torch.cuda.synchronize()
    return prof
|
||
|
||
def print_profiling_table(prof, sort_by="cpu_time_total", row_limit=20):
    """Print the profiler's averaged-event table, preceded by a banner line.

    Args:
        prof: Profiler object exposing ``key_averages(group_by_stack_n=...)``
            whose result exposes ``table(...)``.
        sort_by: Column name forwarded to ``table`` and echoed in the banner.
        row_limit: Maximum number of rows forwarded to ``table``.
    """
    print(f"###### sorted by {sort_by} ######")
    averages = prof.key_averages(group_by_stack_n=100)
    # Very wide column limits so source locations, names and shapes are not
    # truncated in the printed table.
    table = averages.table(
        sort_by=sort_by,
        row_limit=row_limit,
        max_src_column_width=2000,
        top_level_events_only=False,
        max_name_column_width=2000,
        max_shapes_column_width=1000,
    )
    print(table)
) | ||
|
||
# print(prof.key_averages(group_by_stack_n=5).table( | ||
# sort_by="self_cuda_time_total", | ||
# row_limit=20, | ||
# max_src_column_width=100, # Increase source column width | ||
# top_level_events_only=False | ||
# )) | ||
|
||
# prof.export_stacks("_x", "self_cuda_time_total") | ||
def explore_event_values(event):
    """Print every public, non-callable attribute of ``event``.

    Debugging aid: for each name returned by ``dir(event)`` that does not
    start with an underscore, the attribute's name and value are printed,
    followed by a separator line. Callable attributes are skipped. If reading
    or printing an attribute raises, a one-line error marker is printed
    instead and iteration continues.

    Args:
        event: Any object to inspect.
    """
    for attr in dir(event):
        if attr.startswith("_"):  # Skip internal attributes
            continue
        try:
            value = getattr(event, attr)
            if callable(value):  # Skip methods
                continue
            print(f"\n{attr}:")
            print(value)
            print("-" * 50)  # Separator for better readability
        except Exception:
            # Deliberately broad: one failing attribute must not stop the
            # dump of the remaining ones.
            print(f"{attr}: <error accessing attribute>")
|
||
|
||
def parse_args(argv=None):
    """Parse the command-line options of the profiling script.

    Args:
        argv: Optional list of argument strings. When ``None`` (the default),
            ``argparse`` reads ``sys.argv[1:]``, as before.

    Returns:
        An ``argparse.Namespace`` with ``exp_number`` (str, ``None`` when not
        given), ``tp_size`` (int, default 1) and ``dimensions`` (the raw
        comma-separated string).
    """
    parser = argparse.ArgumentParser(description="Run profiling experiments with configurable dimensions")
    parser.add_argument("--exp_number", type=str, help="Experiment number")
    parser.add_argument("--tp_size", type=int, default=1, help="Tensor Parallel size")
    parser.add_argument(
        "--dimensions",
        type=str,
        default="1024,2048,4096,8192,16384,32768",
        help="Comma-separated list of dimensions to test",
    )
    return parser.parse_args(argv)
|
||
# for event in prof.events(): | ||
# print(event.name, event.stack) | ||
|
||
# prof.export_chrome_trace("trace.json") | ||
def collect_top_cpu_events(averages, M, N, K, top_k=5):
    """Build result rows for the ``top_k`` events with the largest CPU time.

    Args:
        averages: Iterable of averaged profiler events exposing ``key``,
            ``cpu_time_total``, ``self_cpu_time_total`` and
            ``cuda_time_total``.
        M, N, K: Problem dimensions recorded in every row.
        top_k: Number of events to keep, ranked by ``cpu_time_total``.

    Returns:
        A list of dicts, one per kept event, in descending CPU-time order.
    """
    events = list(averages)
    # NOTE: the denominator sums ``self_cpu_time_total`` rather than
    # ``cpu_time_total``; the original author tested this against the table
    # torch itself generates. Source:
    # https://github.com/pytorch/pytorch/blob/f14f245747db2f80e963bd72561f5bd5ed216a4a/torch/autograd/profiler_util.py#L976-L990
    cpu_time_total_of_all_events = sum(event.self_cpu_time_total for event in events)
    top_events = sorted(events, key=lambda event: event.cpu_time_total, reverse=True)[:top_k]

    rows = []
    for event in top_events:
        if cpu_time_total_of_all_events:
            event_cpu_time_percent = (event.cpu_time_total / cpu_time_total_of_all_events) * 100
        else:
            # Nothing was measured; avoid dividing by zero.
            event_cpu_time_percent = 0.0
        rows.append(
            {
                "M": M,
                "N": N,
                "K": K,
                "Operation": event.key,
                "CPU Time (ms)": event.cpu_time_total / 1000,  # Convert to milliseconds
                "CPU Time %": f"{event_cpu_time_percent:.2f}%",
                "CUDA Time (ms)": event.cuda_time_total / 1000,  # Convert to milliseconds
            }
        )
    return rows


if __name__ == "__main__":
    torch.backends.cudnn.benchmark = True

    args = parse_args()

    # Parse dimensions from the comma-separated string to a list of integers;
    # empty tokens (e.g. from a trailing comma) are ignored.
    dimensions = [int(token) for token in (part.strip() for part in args.dimensions.split(",")) if token]
    TP_SIZE = args.tp_size
    EXP_NUMBER = args.exp_number

    results = []
    # Every (M, N, K) combination is run, i.e. len(dimensions) ** 3 experiments.
    total = len(dimensions) ** 3
    experiment_count = 0
    parallel_context = ParallelContext(data_parallel_size=1, pipeline_parallel_size=1, tensor_parallel_size=TP_SIZE)

    for M, N, K in itertools.product(dimensions, repeat=3):
        exp_name = f"{EXP_NUMBER}_fp8_m{M}_n{N}_k{K}_and_tp{TP_SIZE}"
        # Advance the progress counter (the total stays fixed).
        experiment_count += 1
        print(f"Running experiment with M={M}, N={N}, K={K}, {experiment_count}/{total}")

        prof = run_experiment(exp_name, M, N, K, TP_SIZE=TP_SIZE, parallel_context=parallel_context)

        if dist.get_rank() == 0:
            print_profiling_table(prof, sort_by="cpu_time_total")
            print_profiling_table(prof, sort_by="cuda_time_total")

        # Record the top 5 operations by CPU time for this combination.
        averages = prof.key_averages(group_by_stack_n=100)
        results.extend(collect_top_cpu_events(averages, M, N, K))

    if dist.get_rank() == 0:
        df = pd.DataFrame(results)
        print("\nTop 5 most time-consuming operations for each dimension combination:")
        print(df.to_string())
        df.to_csv(
            f'{EXP_NUMBER}_profiling_results_with_m_n_k_with_cartesian_product_{"_".join(map(str, dimensions))}.csv',
            index=False,
        )