Skip to content

Commit

Permalink
added llama models
Browse files Browse the repository at this point in the history
  • Loading branch information
zhenghh04 committed Feb 6, 2025
1 parent d25d730 commit 58c220e
Show file tree
Hide file tree
Showing 4 changed files with 52 additions and 4 deletions.
46 changes: 46 additions & 0 deletions dlio_benchmark/configs/workload/llama_1t.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,46 @@
# we mimic the checkpoint data for megatron-deepspeed
# Workload config for a ~1T-parameter LLaMA-style model (file: llama_1t.yaml).
model:
  # NOTE(review): original said llama_405b here — a copy-paste from
  # llama_405b.yaml. Renamed to llama_1t to match this file's identity and
  # to keep data/checkpoint paths distinct from the 405b workload.
  name: llama_1t
  type: transformer
  model_size: 30102
  num_layers: 128
  parallelism:
    tensor: 8
    pipeline: 64
    zero_stage: 1
  transformer:
    vocab_size: 128000
    hidden_size: 25872
    ffn_hidden_size: 98304

framework: pytorch

workflow:
  generate_data: True
  train: True
  checkpoint: True

dataset:
  data_folder: data/llama_1t/
  format: mmap_indexed_binary
  num_files_train: 1
  num_samples_per_file: 1048576
  record_length: 2048

reader:
  data_loader: pytorch
  batch_size: 16
  read_threads: 1
  file_shuffle: seed
  sample_shuffle: seed

train:
  epochs: 3
  # NOTE(review): comment said "2.44 sec per step" but the value is 5 —
  # confirm which is intended before relying on emulated compute time.
  computation_time: 5
  total_training_steps: 5

checkpoint:
  checkpoint_folder: checkpoints/llama_1t
  steps_between_checkpoints: 1
  type: all_ranks
3 changes: 1 addition & 2 deletions dlio_benchmark/configs/workload/llama_405b.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -3,15 +3,14 @@ model:
name: llama_405b
type: transformer
model_size: 30102
num_layers: 2
num_layers: 126
parallelism:
tensor: 8
pipeline: 16
zero_stage: 1
transformer:
vocab_size: 128000
hidden_size: 16384
num_layers: 126
ffn_hidden_size: 53248

framework: pytorch
Expand Down
1 change: 1 addition & 0 deletions dlio_benchmark/configs/workload/llama_70b.yaml
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
# we mimic the checkpoint data for megatron-deepspeed
model:
name: llama_70b
type: transformer
Expand Down
6 changes: 4 additions & 2 deletions dlio_benchmark/configs/workload/megatron_deepspeed.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -4,8 +4,10 @@ model:
optimization_groups: [1009254400, 865075200, 793600]
model_size: 30102
num_layers: 40
pipeline_parallelism: 8
tensor_parallelism: 4
parallelism:
pipeline: 8
tensor: 4
zero_stage: -1
layer_parameters: [52583936, 209715200]

framework: pytorch
Expand Down

0 comments on commit 58c220e

Please sign in to comment.