From 58c220eabb701826bc719e3d0a60e88d3bc3ac83 Mon Sep 17 00:00:00 2001
From: Huihuo Zheng
Date: Thu, 6 Feb 2025 03:02:43 -0600
Subject: [PATCH] added llama models

---
 dlio_benchmark/configs/workload/llama_1t.yaml | 46 +++++++++++++++++++
 .../configs/workload/llama_405b.yaml          |  3 +-
 .../configs/workload/llama_70b.yaml           |  1 +
 .../configs/workload/megatron_deepspeed.yaml  |  6 ++-
 4 files changed, 52 insertions(+), 4 deletions(-)
 create mode 100644 dlio_benchmark/configs/workload/llama_1t.yaml

diff --git a/dlio_benchmark/configs/workload/llama_1t.yaml b/dlio_benchmark/configs/workload/llama_1t.yaml
new file mode 100644
index 00000000..fc736540
--- /dev/null
+++ b/dlio_benchmark/configs/workload/llama_1t.yaml
@@ -0,0 +1,46 @@
+# we mimic the checkpoint data for megatron-deepspeed
+model:
+  name: llama_1t
+  type: transformer
+  model_size: 30102
+  num_layers: 128
+  parallelism:
+    tensor: 8
+    pipeline: 64
+    zero_stage: 1
+  transformer:
+    vocab_size: 128000
+    hidden_size: 25872
+    ffn_hidden_size: 98304
+
+framework: pytorch
+
+workflow:
+  generate_data: True
+  train: True
+  checkpoint: True
+
+dataset:
+  data_folder: data/llama_1t/
+  format: mmap_indexed_binary
+  num_files_train: 1
+  num_samples_per_file: 1048576
+  record_length: 2048
+
+reader:
+  data_loader: pytorch
+  batch_size: 16
+  read_threads: 1
+  file_shuffle: seed
+  sample_shuffle: seed
+
+train:
+  epochs: 3
+  computation_time: 5 # 2.44 sec per step
+  total_training_steps: 5
+
+
+checkpoint:
+  checkpoint_folder: checkpoints/llama_1t
+  steps_between_checkpoints: 1
+  type: all_ranks
diff --git a/dlio_benchmark/configs/workload/llama_405b.yaml b/dlio_benchmark/configs/workload/llama_405b.yaml
index 3a07e75a..fb42e283 100644
--- a/dlio_benchmark/configs/workload/llama_405b.yaml
+++ b/dlio_benchmark/configs/workload/llama_405b.yaml
@@ -3,7 +3,7 @@ model:
   name: llama_405b
   type: transformer
   model_size: 30102
-  num_layers: 2
+  num_layers: 126
   parallelism:
     tensor: 8
     pipeline: 16
@@ -11,7 +11,6 @@ model:
   transformer:
     vocab_size: 128000
     hidden_size: 16384
-    num_layers: 126
     ffn_hidden_size: 53248
 
 framework: pytorch
diff --git a/dlio_benchmark/configs/workload/llama_70b.yaml b/dlio_benchmark/configs/workload/llama_70b.yaml
index 66185c48..6cc0121d 100644
--- a/dlio_benchmark/configs/workload/llama_70b.yaml
+++ b/dlio_benchmark/configs/workload/llama_70b.yaml
@@ -1,3 +1,4 @@
+# we mimic the checkpoint data for megatron-deepspeed
 model:
   name: llama_70b
   type: transformer
diff --git a/dlio_benchmark/configs/workload/megatron_deepspeed.yaml b/dlio_benchmark/configs/workload/megatron_deepspeed.yaml
index 588d1e82..d5b8ff4f 100644
--- a/dlio_benchmark/configs/workload/megatron_deepspeed.yaml
+++ b/dlio_benchmark/configs/workload/megatron_deepspeed.yaml
@@ -4,8 +4,10 @@ model:
   optimization_groups: [1009254400, 865075200, 793600]
   model_size: 30102
   num_layers: 40
-  pipeline_parallelism: 8
-  tensor_parallelism: 4
+  parallelism:
+    pipeline: 8
+    tensor: 4
+    zero_stage: -1
   layer_parameters: [52583936, 209715200]
 
 framework: pytorch
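
Usage sketch (not part of the patch; assumes the standard Hydra-style dlio_benchmark entry point, with the launcher and rank count shown only as an illustration):

    # run the benchmark with the new workload config
    mpirun -np 512 dlio_benchmark workload=llama_1t

Here workload=llama_1t selects dlio_benchmark/configs/workload/llama_1t.yaml by name, and 512 ranks correspond to the tensor x pipeline product (8 x 64) declared in that file. Individual fields can be overridden on the command line with Hydra syntax, e.g. ++workload.checkpoint.steps_between_checkpoints=2.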