Skip to content

Commit

Permalink
fixed formula
Browse files: browse the repository at this point in the history
  • Loading branch information
hariharan-devarajan committed Oct 5, 2024
1 parent 9f12a50 commit c412725
Showing 1 changed file with 14 additions and 6 deletions.
20 changes: 14 additions & 6 deletions tests/dlio_benchmark_test.py
Original file line number Diff line number Diff line change
@@ -228,8 +228,8 @@ def test_iostat_profiling() -> None:
("pytorch", 1024, [1024, 128], 2, [16], "all_ranks"),
("tensorflow", 1024, [1024, 128], 2, [16], "rank_zero"),
("pytorch", 1024, [1024, 128], 2, [16], "rank_zero"),
("tensorflow", 1024, [128], 1, [], "all_ranks"),
("pytorch", 1024, [128], 1, [], "all_ranks")])
("tensorflow", 1024, [128], 1, [16], "all_ranks"),
("pytorch", 1024, [128], 1, [16], "all_ranks")])
def test_checkpoint_epoch(framework, model_size, optimizers, num_layers, layer_params, type) -> None:
init()
clean()
@@ -239,15 +239,17 @@ def test_checkpoint_epoch(framework, model_size, optimizers, num_layers, layer_p
logging.info(f" DLIO test for checkpointing at the end of epochs")
logging.info("=" * 80)
with initialize_config_dir(version_base=None, config_dir=config_dir):
epochs = 8
epoch_per_ckp = 2
cfg = compose(config_name='config',
overrides=[f'++workload.framework={framework}',
f'++workload.reader.data_loader={framework}',
'++workload.workflow.train=True',
'++workload.workflow.generate_data=True',
'++workload.train.computation_time=0.01',
'++workload.evaluation.eval_time=0.005',
'++workload.train.epochs=8', '++workload.workflow.checkpoint=True',
'++workload.checkpoint.epochs_between_checkpoints=2',
f'++workload.train.epochs={epochs}', '++workload.workflow.checkpoint=True',
f'++workload.checkpoint.epochs_between_checkpoints={epoch_per_ckp}',
f'++workload.checkpoint.type={type}',
f'++workload.checkpoint.model_size={model_size}',
f'++workload.checkpoint.optimization_groups={optimizers}',
@@ -267,11 +269,17 @@ def test_checkpoint_epoch(framework, model_size, optimizers, num_layers, layer_p
nranks = 1
if type == "all_ranks":
nranks = comm.size
num_model_files = 1
num_optimizer_files = 1
num_layer_files = num_layers
files_per_checkpoint = (num_model_files + num_optimizer_files + num_layer_files) * nranks
comm.Barrier()
if framework == "tensorflow":
num_check_files = 8 / 2 * (2 + 2 + 2*n) * nranks + 1
file_per_ckp = 2
num_check_files = epochs / epoch_per_ckp * files_per_checkpoint * file_per_ckp + 1
assert (len(load_bin) == num_check_files), f"files produced are {len(load_bin)} {num_check_files} {load_bin} "
if framework == "pytorch":
num_check_files = 8 / 2 * (1 + 1 + n) * nranks
num_check_files = epochs / epoch_per_ckp * files_per_checkpoint
assert (len(load_bin) == num_check_files), f"files produced are {len(load_bin)} {num_check_files} {load_bin}"
comm.Barrier()
if comm.rank == 0:
Expand Down

0 comments on commit c412725

Please sign in to comment.