diff --git a/benchmark/fp8_tp.py b/benchmark/fp8_tp.py
index 23e47787..8a5459b0 100644
--- a/benchmark/fp8_tp.py
+++ b/benchmark/fp8_tp.py
@@ -1,31 +1,21 @@
-# from nanotron import distributed as dist
+import argparse
+import itertools
 
-# import torch.distributed as dist
+import pandas as pd
 import torch
+import torch.distributed as dist
 from nanotron.parallel import ParallelContext
 from nanotron.parallel.tensor_parallel.enum import TensorParallelLinearMode
-from nanotron.parallel.tensor_parallel.nn import (
-    FP8TensorParallelColumnLinear,
-)
+from nanotron.parallel.tensor_parallel.nn import FP8TensorParallelColumnLinear
 from torch.profiler import ProfilerActivity
 
-if __name__ == "__main__":
-    # NOTE: divisible by 16 for TP
-    in_features = 4096
-    out_features = 4096 * 4
-
-    parallel_context = ParallelContext(data_parallel_size=1, pipeline_parallel_size=1, tensor_parallel_size=2)
-    # out_features_per_tp_rank = 16
-    # out_features = parallel_context.tp_pg.size() * out_features_per_tp_rank
-
-    batch_size = 128
-    seq_len = 8192
-    merged_gbs = batch_size * seq_len
-    sharded_random_input = torch.randn(batch_size, in_features, device="cuda", requires_grad=True)
 
+def run_experiment(exp_name, M, N, K, TP_SIZE, parallel_context):
+    torch.cuda.synchronize()
+    input = torch.randn(M, K, device="cuda", requires_grad=True)
     column_linear = FP8TensorParallelColumnLinear(
-        in_features=in_features,
-        out_features=out_features,
+        in_features=K,
+        out_features=N,
         pg=parallel_context.tp_pg,
         mode=TensorParallelLinearMode.ALL_REDUCE,
         device="cuda",
@@ -33,80 +23,127 @@
         bias=False,
     )
 
-    def trace_handler(p):
-        output = p.key_averages(group_by_stack_n=5).table(sort_by="self_cuda_memory_usage", row_limit=20)
-        print(output)
-        p.export_chrome_trace("./trace_" + str(p.step_num) + ".json")
-
     with torch.profiler.profile(
         activities=[ProfilerActivity.CPU, ProfilerActivity.CUDA],
-        # schedule=torch.profiler.schedule(wait=1, warmup=1, active=3, repeat=1),
-        on_trace_ready=torch.profiler.tensorboard_trace_handler(
-            "./log/exp900a01_fp8_tp2_and_mbs_16",
-            # use_gzip=True
-        ),
+        on_trace_ready=torch.profiler.tensorboard_trace_handler(f"./log/{exp_name}"),
         record_shapes=True,
         profile_memory=True,
         with_stack=True,
-        # with_flops=True,
         with_modules=True,
-        # on_trace_ready=trace_handler,
         experimental_config=torch._C._profiler._ExperimentalConfig(verbose=True),
         use_cuda=True,
     ) as prof:
         prof.step()
-        sharded_output = column_linear(sharded_random_input)
+        sharded_output = column_linear(input)
         sharded_output.sum().backward()
 
-
-# if dist.get_rank() == 0:
-# print("sharded_output.dtype: ", sharded_output.dtype)
-# print(prof.key_averages(group_by_stack_n=5).table(sort_by="cuda_time_total", row_limit=20, top_level_events_only=False))
-
-# # Print detailed events with stack traces
-# for event in prof.events():
-#     if 'aten::copy_' in event.name:
-#         print(f"\nOperation: {event.name}")
-#         print(f"CUDA time: {event.cuda_time_total/1000:.2f}ms")
-#         if event.stack:  # Print full stack trace
-#             print("Stack trace:")
-#             try:
-#                 # Print the raw stack information
-#                 print("Raw stack info:", event.stack)
-
-#                 # Safely iterate through stack frames
-#                 for frame in event.stack:
-#                     if isinstance(frame, (list, tuple)):
-#                         print("  ", " - ".join(str(x) for x in frame))
-#                     else:
-#                         print("  ", frame)
-#             except Exception as e:
-#                 print(f"Error printing stack: {e}")
-#                 print("Raw stack data:", repr(event.stack))
-#         print("-" * 80)
-
-
-print(
-    prof.key_averages(group_by_stack_n=1000).table(
-        sort_by="self_cuda_time_total",
-        row_limit=20,
-        max_src_column_width=100,  # Increase source column width
-        top_level_events_only=False,
-        # max_name_column_width=1000,
-        max_name_column_width=120,
+    torch.cuda.synchronize()
+    return prof
+
+
+def print_profiling_table(prof, sort_by="cpu_time_total"):
+    print(f"###### sorted by {sort_by} ######")
+    print(
+        prof.key_averages(group_by_stack_n=100).table(
+            sort_by=sort_by,
+            row_limit=20,
+            max_src_column_width=2000,  # Increase source column width
+            top_level_events_only=False,
+            max_name_column_width=2000,
+            max_shapes_column_width=1000,
+        )
     )
-)
 
-# print(prof.key_averages(group_by_stack_n=5).table(
-#     sort_by="self_cuda_time_total",
-#     row_limit=20,
-#     max_src_column_width=100,  # Increase source column width
-#     top_level_events_only=False
-# ))
 
-# prof.export_stacks("_x", "self_cuda_time_total")
+def explore_event_values(event):
+    for attr in dir(event):
+        if not attr.startswith("_"):  # Skip internal attributes
+            try:
+                value = getattr(event, attr)
+                if callable(value):  # Skip methods
+                    continue
+                print(f"\n{attr}:")
+                print(value)
+                print("-" * 50)  # Separator for better readability
+            except Exception:
+                print(f"{attr}: <error accessing attribute>")
+
+
+def parse_args():
+    parser = argparse.ArgumentParser(description="Run profiling experiments with configurable dimensions")
+    parser.add_argument("--exp_number", type=str, help="Experiment number")
+    parser.add_argument("--tp_size", type=int, default=1, help="Tensor Parallel size")
+    parser.add_argument(
+        "--dimensions",
+        type=str,
+        default="1024,2048,4096,8192,16384,32768",
+        help="Comma-separated list of dimensions to test",
+    )
+    return parser.parse_args()
 
-# for event in prof.events():
-#     print(event.name, event.stack)
 
-# prof.export_chrome_trace("trace.json")
+if __name__ == "__main__":
+    torch.backends.cudnn.benchmark = True
+
+    args = parse_args()
+
+    # Parse dimensions from comma-separated string to list of integers
+    dimensions = [int(d.strip()) for d in args.dimensions.split(",")]
+    TP_SIZE = args.tp_size
+    EXP_NUMBER = args.exp_number
+
+    # dimensions = [1024, 2048, 4096, 8192, 16384]
+    # TP_SIZE = 8
+
+    results = []
+    total = len(list(itertools.product(dimensions, dimensions, dimensions)))
+    experiment_count = 0
+    parallel_context = ParallelContext(data_parallel_size=1, pipeline_parallel_size=1, tensor_parallel_size=TP_SIZE)
+
+    for M, N, K in itertools.product(dimensions, dimensions, dimensions):
+        exp_name = f"{EXP_NUMBER}_fp8_m{M}_n{N}_k{K}_and_tp{TP_SIZE}"
+        total += 1
+        print(f"Running experiment with M={M}, N={N}, K={K}, {experiment_count}/{total}")
+
+        prof = run_experiment(exp_name, M, N, K, TP_SIZE=TP_SIZE, parallel_context=parallel_context)
+
+        if dist.get_rank() == 0:
+            print_profiling_table(prof, sort_by="cpu_time_total")
+            print_profiling_table(prof, sort_by="cuda_time_total")
+            # explore_event_values(table)
+
+            # Get top 5 operations by CPU time
+            # sorted_events = prof.key_averages().table(sort_by="cpu_time_total")
+
+            # NOTE: loop through all events and sum up the total time, then calculate the percent
+            averages = prof.key_averages(group_by_stack_n=100)
+            # NOTE: why sum .self_cpu_time_total instead of .cpu_time_total?
+            # source: https://github.com/pytorch/pytorch/blob/f14f245747db2f80e963bd72561f5bd5ed216a4a/torch/autograd/profiler_util.py#L976-L990
+            # i test and it matches the torch's generated table
+            cpu_time_total_of_all_events = sum([event.self_cpu_time_total for event in averages])
+            sorted_events = sorted(averages, key=lambda x: x.cpu_time_total, reverse=True)[:5]
+
+            for event in sorted_events:
+                event_cpu_time_percent = (event.cpu_time_total / cpu_time_total_of_all_events) * 100
+
+                results.append(
+                    {
+                        "M": M,
+                        "N": N,
+                        "K": K,
+                        "Operation": event.key,
+                        "CPU Time (ms)": event.cpu_time_total / 1000,  # Convert to milliseconds
+                        "CPU Time %": f"{event_cpu_time_percent:.2f}%",
+                        "CUDA Time (ms)": event.cuda_time_total / 1000,  # Convert to milliseconds
+                        # 'Memory Used (MB)': event.cpu_memory_usage / (1024 * 1024) if event.cpu_memory_usage else 0
+                    }
+                )
+
+if dist.get_rank() == 0:
+    df = pd.DataFrame(results)
+    print("\nTop 5 most time-consuming operations for each dimension combination:")
+    print(df.to_string())
+    df.to_csv(
+        f'{EXP_NUMBER}_profiling_results_with_m_n_k_with_cartesian_product_{"_".join(map(str, dimensions))}.csv',
+        index=False,
+    )