-
Notifications
You must be signed in to change notification settings - Fork 30
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
added checkpointing to support LLMs (#114)
* added checkpointing to support LLMs
* added indexed binary data support for LLMs
* added configuration for megatron deepspeed
* fixes for out-of-core data generation
* added dlrm configuration
* added changes to support mmapped files
* fixed checkpointing for tensors
* Update torch_framework.py: fix rank-for-merge bug
* Update indexed_binary_generator.py: change GB to an absolute value
* refactored enum for better naming
* documentation for the checkpointing
* made data-generation buffer_size configurable
* Update tf_framework.py: args model size
* Update megatron_deepspeed.yaml
- Loading branch information
1 parent
0720984
commit 0a6130a
Showing
17 changed files
with
647 additions
and
74 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,34 @@ | ||
# DLIO benchmark workload configuration for the DLRM recommendation model.
model: dlrm

framework: pytorch

workflow:
  generate_data: False   # dataset is expected to exist already
  train: True
  do_eval: True

dataset:
  data_folder: data/dlrm
  format: indexed_binary
  num_files_train: 1
  num_files_eval: 1
  num_samples_per_file: 4195198976
  record_length: 327680            # per-sample record size — presumably bytes; verify against generator
  keep_files: True
  eval_num_samples_per_file: 91681240

reader:
  data_loader: pytorch
  batch_size: 2048
  batch_size_eval: 16384
  sample_shuffle: random

train:
  epochs: 1
  computation_time: 0.064296       # emulated compute time per step — presumably seconds; confirm
  total_training_steps: 32768
  total_eval_steps: 2048

evaluation:
  eval_time: 0.0843                # emulated eval time per step — presumably seconds; confirm
  steps_between_evals: 16384
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,36 @@ | ||
# DLIO benchmark workload configuration for Megatron-DeepSpeed LLM training.
# 8 node run with 4 GPUs per node and TPSIZE=4 and PPSIZE=8
model: megatron_deepspeed

framework: pytorch

workflow:
  generate_data: False   # dataset is expected to exist already
  train: True
  checkpoint: True

dataset:
  data_folder: dataset/megatron-deepspeed/
  format: mmap_indexed_binary
  num_files_train: 1
  num_samples_per_file: 277203535
  record_length: 2048    # per-sample record size — presumably bytes; verify against generator

reader:
  data_loader: pytorch
  batch_size: 1024
  read_threads: 1
  file_shuffle: seed     # seeded shuffles keep ordering reproducible across runs
  sample_shuffle: seed

train:
  epochs: 311541
  computation_time: 0.03 # every iteration has 290 steps and each iteration is 8.9 sec.

checkpoint:
  checkpoint_folder: checkpoints/megatron-deepspeed
  steps_between_checkpoints: 1000
  model_size: 30102
  type: all_ranks        # every rank writes checkpoint data
  optimization_groups: [1009254400, 865075200, 793600]
  num_layers: 44
  layer_parameters: [129761280, 20971520]
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,97 @@ | ||
""" | ||
Copyright (c) 2022, UChicago Argonne, LLC | ||
All Rights Reserved | ||
Licensed under the Apache License, Version 2.0 (the "License"); | ||
you may not use this file except in compliance with the License. | ||
You may obtain a copy of the License at | ||
http://www.apache.org/licenses/LICENSE-2.0 | ||
Unless required by applicable law or agreed to in writing, software | ||
distributed under the License is distributed on an "AS IS" BASIS, | ||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||
See the License for the specific language governing permissions and | ||
limitations under the License. | ||
""" | ||
|
||
from dlio_benchmark.common.enumerations import Compression | ||
from dlio_benchmark.data_generator.data_generator import DataGenerator | ||
|
||
import logging | ||
import numpy as np | ||
|
||
from dlio_benchmark.utils.utility import progress, utcnow | ||
from dlio_profiler.logger import fn_interceptor as Profile | ||
from shutil import copyfile | ||
from dlio_benchmark.common.constants import MODULE_DATA_GENERATOR | ||
import struct | ||
|
||
dlp = Profile(MODULE_DATA_GENERATOR) | ||
|
||
""" | ||
Generator for creating data in NPZ format. | ||
""" | ||
class IndexedBinaryGenerator(DataGenerator): | ||
def __init__(self): | ||
super().__init__() | ||
|
||
def index_file_path_off(self, prefix_path): | ||
return prefix_path + '.off.idx' | ||
|
||
def index_file_path_size(self, prefix_path): | ||
return prefix_path + '.sz.idx' | ||
|
||
@dlp.log | ||
def generate(self): | ||
""" | ||
Generator for creating data in NPZ format of 3d dataset. | ||
""" | ||
super().generate() | ||
np.random.seed(10) | ||
GB=1073741824 | ||
for i in dlp.iter(range(self.my_rank, int(self.total_files_to_generate), self.comm_size)): | ||
dim1, dim2 = self.get_dimension() | ||
sample_size = dim1 * dim2 | ||
total_size = sample_size * self.num_samples | ||
write_size = total_size | ||
memory_size = self._args.generation_buffer_size | ||
if total_size > memory_size: | ||
write_size = memory_size - (memory_size % sample_size) | ||
out_path_spec = self.storage.get_uri(self._file_list[i]) | ||
out_path_spec_off_idx = self.index_file_path_off(out_path_spec) | ||
out_path_spec_sz_idx = self.index_file_path_size(out_path_spec) | ||
progress(i + 1, self.total_files_to_generate, "Generating Indexed Binary Data") | ||
prev_out_spec = out_path_spec | ||
written_bytes = 0 | ||
data_file = open(out_path_spec, "wb") | ||
off_file = open(out_path_spec_off_idx, "wb") | ||
sz_file = open(out_path_spec_sz_idx, "wb") | ||
records = np.random.randint(255, size=write_size, dtype=np.uint8) | ||
while written_bytes < total_size: | ||
data_to_write = write_size if written_bytes + write_size <= total_size else total_size - written_bytes | ||
samples_to_write = data_to_write // sample_size | ||
|
||
# Write data | ||
myfmt = 'B' * data_to_write | ||
binary_data = struct.pack(myfmt, *records[:data_to_write]) | ||
data_file.write(binary_data) | ||
|
||
# Write offsets | ||
myfmt = 'Q' * samples_to_write | ||
offsets = range(0, data_to_write, sample_size) | ||
offsets = offsets[:samples_to_write] | ||
binary_offsets = struct.pack(myfmt, *offsets) | ||
off_file.write(binary_offsets) | ||
|
||
# Write sizes | ||
myfmt = 'Q' * samples_to_write | ||
sample_sizes = [sample_size] * samples_to_write | ||
binary_sizes = struct.pack(myfmt, *sample_sizes) | ||
sz_file.write(binary_sizes) | ||
|
||
written_bytes = written_bytes + data_to_write | ||
data_file.close() | ||
off_file.close() | ||
sz_file.close() | ||
np.random.seed() |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Oops, something went wrong.