From 7e0bac2d382e0bbb6b758871cb2da829901dc90a Mon Sep 17 00:00:00 2001 From: Huihuo Zheng Date: Fri, 31 May 2024 17:22:07 -0500 Subject: [PATCH 01/19] Update config.py (#196) fixed typo --- dlio_benchmark/utils/config.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/dlio_benchmark/utils/config.py b/dlio_benchmark/utils/config.py index ab793d5b..088ce149 100644 --- a/dlio_benchmark/utils/config.py +++ b/dlio_benchmark/utils/config.py @@ -461,8 +461,8 @@ def LoadConfig(args, config): args.data_loader_sampler = DataLoaderSampler(reader['data_loader_sampler']) if 'read_threads' in reader: args.read_threads = reader['read_threads'] - if 'computatation_threads' in reader: - args.computatation_threads = reader['computatation_threads'] + if 'computation_threads' in reader: + args.computation_threads = reader['computation_threads'] if 'batch_size' in reader: args.batch_size = reader['batch_size'] if 'batch_size_eval' in reader: From b9e2018b5399c823a4eb777721f96c1b526c82e7 Mon Sep 17 00:00:00 2001 From: Louis Douriez Date: Mon, 3 Jun 2024 07:42:42 -0600 Subject: [PATCH 02/19] Shard filenames instead of images (tfreader) --- dlio_benchmark/reader/tf_reader.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/dlio_benchmark/reader/tf_reader.py b/dlio_benchmark/reader/tf_reader.py index 894fe8af..8c2bcfe5 100644 --- a/dlio_benchmark/reader/tf_reader.py +++ b/dlio_benchmark/reader/tf_reader.py @@ -83,9 +83,11 @@ def _parse_image(self, serialized): def next(self): logging.debug( f"{utcnow()} Reading {len(self._file_list)} files thread {self.thread_index} rank {self._args.my_rank}") - self._dataset = tf.data.TFRecordDataset(filenames=self._file_list, buffer_size=self._args.transfer_size, - num_parallel_reads=self._args.read_threads) + filenames = tf.data.Dataset.list_files(self._file_list, shuffle=True) + filenames = filenames.shard(num_shards=self._args.comm_size, index=self._args.my_rank) + self._dataset = tf.data.TFRecordDataset(filenames=filenames, buffer_size=self._args.transfer_size, + num_parallel_reads=self._args.read_threads) if self._args.sample_shuffle != Shuffle.OFF: if self._args.sample_shuffle == Shuffle.SEED: self._dataset = self._dataset.shuffle(buffer_size=self._args.shuffle_size, @@ -93,7 +95,6 @@ def next(self): else: self._dataset = self._dataset.shuffle(buffer_size=self._args.shuffle_size) - self._dataset = self._dataset.shard(num_shards=self._args.comm_size, index=self._args.my_rank) self._dataset = self._dataset.batch(self.batch_size, drop_remainder=True) self._dataset = self._dataset.map( lambda x: tf.py_function(func=self._parse_image, inp=[x], Tout=[tf.uint8]), From c1719677ba7314054a66c8b196d1779b0b319456 Mon Sep 17 00:00:00 2001 From: Huihuo Zheng Date: Mon, 3 Jun 2024 09:49:32 -0500 Subject: [PATCH 03/19] Update tf_reader.py --- dlio_benchmark/reader/tf_reader.py | 18 ++++++++++++------ 1 file changed, 12 insertions(+), 6 deletions(-) diff --git a/dlio_benchmark/reader/tf_reader.py b/dlio_benchmark/reader/tf_reader.py index 8c2bcfe5..4ff88a40 100644 --- a/dlio_benchmark/reader/tf_reader.py +++ b/dlio_benchmark/reader/tf_reader.py @@ -83,18 +83,24 @@ def _parse_image(self, serialized): def next(self): logging.debug( f"{utcnow()} Reading {len(self._file_list)} files thread {self.thread_index} rank {self._args.my_rank}") - filenames = tf.data.Dataset.list_files(self._file_list, shuffle=True) - filenames = filenames.shard(num_shards=self._args.comm_size, index=self._args.my_rank) - - self._dataset = tf.data.TFRecordDataset(filenames=filenames, buffer_size=self._args.transfer_size, - num_parallel_reads=self._args.read_threads) + filenames = tf.data.Dataset.list_files(self._file_list, shuffle=True) + # sharding in the file list if we have another files. + if (len(self._file_list) >= self._args.comm_size): + filenames = filenames.shard(num_shards=self._args.comm_size, index=self._args.my_rank) + self._dataset = tf.data.TFRecordDataset(filenames=filenames, buffer_size=self._args.transfer_size, + num_parallel_reads=self._args.read_threads) + if self._args.sample_shuffle != Shuffle.OFF: if self._args.sample_shuffle == Shuffle.SEED: self._dataset = self._dataset.shuffle(buffer_size=self._args.shuffle_size, seed=self._args.seed) else: self._dataset = self._dataset.shuffle(buffer_size=self._args.shuffle_size) - + + # shard the dataset if it is not done already. + if (len(self._file_list) < self._args.comm_size): + self._dataset = self._dataset.shard(num_shards=self._args.comm_size, index=self._args.my_rank) + self._dataset = self._dataset.batch(self.batch_size, drop_remainder=True) self._dataset = self._dataset.map( lambda x: tf.py_function(func=self._parse_image, inp=[x], Tout=[tf.uint8]), From 445b3796473b42e4791ffb2ff202c34dfbbfdd05 Mon Sep 17 00:00:00 2001 From: Huihuo Zheng Date: Mon, 3 Jun 2024 09:51:37 -0500 Subject: [PATCH 04/19] Update tf_reader.py fixed indent issue --- dlio_benchmark/reader/tf_reader.py | 17 +++++++++-------- 1 file changed, 9 insertions(+), 8 deletions(-) diff --git a/dlio_benchmark/reader/tf_reader.py b/dlio_benchmark/reader/tf_reader.py index 4ff88a40..cec6f945 100644 --- a/dlio_benchmark/reader/tf_reader.py +++ b/dlio_benchmark/reader/tf_reader.py @@ -83,12 +83,13 @@ def _parse_image(self, serialized): def next(self): logging.debug( f"{utcnow()} Reading {len(self._file_list)} files thread {self.thread_index} rank {self._args.my_rank}") - filenames = tf.data.Dataset.list_files(self._file_list, shuffle=True) - # sharding in the file list if we have another files. + filenames = tf.data.Dataset.list_files(self._file_list, shuffle=True) + # sharding in the file list if we have another files. if (len(self._file_list) >= self._args.comm_size): - filenames = filenames.shard(num_shards=self._args.comm_size, index=self._args.my_rank) - self._dataset = tf.data.TFRecordDataset(filenames=filenames, buffer_size=self._args.transfer_size, - num_parallel_reads=self._args.read_threads) + filenames = filenames.shard(num_shards=self._args.comm_size, index=self._args.my_rank) + + self._dataset = tf.data.TFRecordDataset(filenames=filenames, buffer_size=self._args.transfer_size, + num_parallel_reads=self._args.read_threads) if self._args.sample_shuffle != Shuffle.OFF: if self._args.sample_shuffle == Shuffle.SEED: @@ -97,9 +98,9 @@ def next(self): else: self._dataset = self._dataset.shuffle(buffer_size=self._args.shuffle_size) - # shard the dataset if it is not done already. - if (len(self._file_list) < self._args.comm_size): - self._dataset = self._dataset.shard(num_shards=self._args.comm_size, index=self._args.my_rank) + # shard the dataset if it is not done already. + if (len(self._file_list) < self._args.comm_size): + self._dataset = self._dataset.shard(num_shards=self._args.comm_size, index=self._args.my_rank) self._dataset = self._dataset.batch(self.batch_size, drop_remainder=True) self._dataset = self._dataset.map( From 8c76262819aa07e8f7b268232b84df33e5acb179 Mon Sep 17 00:00:00 2001 From: Huihuo Zheng Date: Mon, 3 Jun 2024 10:56:07 -0500 Subject: [PATCH 05/19] Update tf_reader.py --- dlio_benchmark/reader/tf_reader.py | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/dlio_benchmark/reader/tf_reader.py b/dlio_benchmark/reader/tf_reader.py index cec6f945..d9393d4c 100644 --- a/dlio_benchmark/reader/tf_reader.py +++ b/dlio_benchmark/reader/tf_reader.py @@ -83,12 +83,12 @@ def _parse_image(self, serialized): def next(self): logging.debug( f"{utcnow()} Reading {len(self._file_list)} files thread {self.thread_index} rank {self._args.my_rank}") - filenames = tf.data.Dataset.list_files(self._file_list, shuffle=True) - # sharding in the file list if we have another files. + filenames = tf.data.Dataset.list_files(self._file_list, shuffle=True) + # sharding in the file list if we have another files. if (len(self._file_list) >= self._args.comm_size): - filenames = filenames.shard(num_shards=self._args.comm_size, index=self._args.my_rank) + filenames = filenames.shard(num_shards=self._args.comm_size, index=self._args.my_rank) - self._dataset = tf.data.TFRecordDataset(filenames=filenames, buffer_size=self._args.transfer_size, + self._dataset = tf.data.TFRecordDataset(filenames=filenames, buffer_size=self._args.transfer_size, num_parallel_reads=self._args.read_threads) if self._args.sample_shuffle != Shuffle.OFF: @@ -98,9 +98,9 @@ def next(self): else: self._dataset = self._dataset.shuffle(buffer_size=self._args.shuffle_size) - # shard the dataset if it is not done already. - if (len(self._file_list) < self._args.comm_size): - self._dataset = self._dataset.shard(num_shards=self._args.comm_size, index=self._args.my_rank) + # shard the dataset if it is not done already. + if (len(self._file_list) < self._args.comm_size): + self._dataset = self._dataset.shard(num_shards=self._args.comm_size, index=self._args.my_rank) self._dataset = self._dataset.batch(self.batch_size, drop_remainder=True) self._dataset = self._dataset.map( From cffc12315a2b6c9efd2e680dfbf8b024554fc295 Mon Sep 17 00:00:00 2001 From: Huihuo Zheng Date: Mon, 3 Jun 2024 11:08:01 -0500 Subject: [PATCH 06/19] fixed the indent issue --- dlio_benchmark/reader/tf_reader.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/dlio_benchmark/reader/tf_reader.py b/dlio_benchmark/reader/tf_reader.py index d9393d4c..e9031208 100644 --- a/dlio_benchmark/reader/tf_reader.py +++ b/dlio_benchmark/reader/tf_reader.py @@ -84,9 +84,10 @@ def next(self): logging.debug( f"{utcnow()} Reading {len(self._file_list)} files thread {self.thread_index} rank {self._args.my_rank}") filenames = tf.data.Dataset.list_files(self._file_list, shuffle=True) - # sharding in the file list if we have another files. + + # sharding in the file list if we have enought files. if (len(self._file_list) >= self._args.comm_size): - filenames = filenames.shard(num_shards=self._args.comm_size, index=self._args.my_rank) + filenames = filenames.shard(num_shards=self._args.comm_size, index=self._args.my_rank) self._dataset = tf.data.TFRecordDataset(filenames=filenames, buffer_size=self._args.transfer_size, num_parallel_reads=self._args.read_threads) @@ -100,7 +101,7 @@ def next(self): # shard the dataset if it is not done already. if (len(self._file_list) < self._args.comm_size): - self._dataset = self._dataset.shard(num_shards=self._args.comm_size, index=self._args.my_rank) + self._dataset = self._dataset.shard(num_shards=self._args.comm_size, index=self._args.my_rank) self._dataset = self._dataset.batch(self.batch_size, drop_remainder=True) self._dataset = self._dataset.map( From 48c1c1e2d9ddee65b536fa387ea8c592f2a05c1e Mon Sep 17 00:00:00 2001 From: Huihuo Zheng Date: Mon, 3 Jun 2024 11:17:01 -0500 Subject: [PATCH 07/19] fixed indentation issue --- dlio_benchmark/reader/tf_reader.py | 20 +++++++++----------- 1 file changed, 9 insertions(+), 11 deletions(-) diff --git a/dlio_benchmark/reader/tf_reader.py b/dlio_benchmark/reader/tf_reader.py index e9031208..bc98d523 100644 --- a/dlio_benchmark/reader/tf_reader.py +++ b/dlio_benchmark/reader/tf_reader.py @@ -81,16 +81,14 @@ def _parse_image(self, serialized): @dlp.log def next(self): - logging.debug( - f"{utcnow()} Reading {len(self._file_list)} files thread {self.thread_index} rank {self._args.my_rank}") - filenames = tf.data.Dataset.list_files(self._file_list, shuffle=True) - - # sharding in the file list if we have enought files. + logging.debug(f"{utcnow()} Reading {len(self._file_list)} files thread {self.thread_index} rank {self._args.my_rank}") + filenames = tf.data.Dataset.list_files(self._file_list, shuffle=True) + # sharding in the file list if we have enought files. if (len(self._file_list) >= self._args.comm_size): - filenames = filenames.shard(num_shards=self._args.comm_size, index=self._args.my_rank) + filenames = filenames.shard(num_shards=self._args.comm_size, index=self._args.my_rank) - self._dataset = tf.data.TFRecordDataset(filenames=filenames, buffer_size=self._args.transfer_size, - num_parallel_reads=self._args.read_threads) + self._dataset = tf.data.TFRecordDataset(filenames=filenames, buffer_size=self._args.transfer_size, + num_parallel_reads=self._args.read_threads) if self._args.sample_shuffle != Shuffle.OFF: if self._args.sample_shuffle == Shuffle.SEED: @@ -99,9 +97,9 @@ def next(self): else: self._dataset = self._dataset.shuffle(buffer_size=self._args.shuffle_size) - # shard the dataset if it is not done already. - if (len(self._file_list) < self._args.comm_size): - self._dataset = self._dataset.shard(num_shards=self._args.comm_size, index=self._args.my_rank) + # shard the dataset if it is not done already. + if (len(self._file_list) < self._args.comm_size): + self._dataset = self._dataset.shard(num_shards=self._args.comm_size, index=self._args.my_rank) self._dataset = self._dataset.batch(self.batch_size, drop_remainder=True) self._dataset = self._dataset.map( From fbda004c259e4c828a4915c65db7ac11da1d116f Mon Sep 17 00:00:00 2001 From: Huihuo Zheng Date: Fri, 7 Jun 2024 00:44:39 -0500 Subject: [PATCH 08/19] added au metric to the configuration file; set shuffling and shuffle buffer size to be 2 for cosmoflow --- dlio_benchmark/configs/workload/cosmoflow_a100.yaml | 6 ++++++ dlio_benchmark/configs/workload/cosmoflow_h100.yaml | 8 +++++++- dlio_benchmark/configs/workload/resnet50_a100.yaml | 5 ++++- dlio_benchmark/configs/workload/resnet50_h100.yaml | 5 ++++- dlio_benchmark/configs/workload/unet3d_a100.yaml | 3 +++ dlio_benchmark/configs/workload/unet3d_h100.yaml | 3 +++ dlio_benchmark/utils/config.py | 5 +++++ dlio_benchmark/utils/statscounter.py | 4 ++-- 8 files changed, 34 insertions(+), 5 deletions(-) diff --git a/dlio_benchmark/configs/workload/cosmoflow_a100.yaml b/dlio_benchmark/configs/workload/cosmoflow_a100.yaml index b7247890..28ea5987 100644 --- a/dlio_benchmark/configs/workload/cosmoflow_a100.yaml +++ b/dlio_benchmark/configs/workload/cosmoflow_a100.yaml @@ -18,7 +18,13 @@ reader: data_loader: tensorflow read_threads: 4 batch_size: 1 + file_shuffle: seed + sample_shuffle: seed + shuffle_size: 2 train: epochs: 5 computation_time: 0.00551 + +metric: + au: 0.70 \ No newline at end of file diff --git a/dlio_benchmark/configs/workload/cosmoflow_h100.yaml b/dlio_benchmark/configs/workload/cosmoflow_h100.yaml index c26a2a68..79f97772 100644 --- a/dlio_benchmark/configs/workload/cosmoflow_h100.yaml +++ b/dlio_benchmark/configs/workload/cosmoflow_h100.yaml @@ -18,7 +18,13 @@ reader: data_loader: tensorflow read_threads: 4 batch_size: 1 - + file_shuffle: seed + sample_shuffle: seed + shuffle_size: 2 + train: epochs: 5 computation_time: 0.00350 + +metric: + au: 0.70 \ No newline at end of file diff --git a/dlio_benchmark/configs/workload/resnet50_a100.yaml b/dlio_benchmark/configs/workload/resnet50_a100.yaml index 5dc66572..acfb2b1f 100644 --- a/dlio_benchmark/configs/workload/resnet50_a100.yaml +++ b/dlio_benchmark/configs/workload/resnet50_a100.yaml @@ -23,4 +23,7 @@ reader: read_threads: 8 computation_threads: 8 batch_size: 400 - dont_use_mmap: True \ No newline at end of file + dont_use_mmap: True + +metric: + au: 0.90 \ No newline at end of file diff --git a/dlio_benchmark/configs/workload/resnet50_h100.yaml b/dlio_benchmark/configs/workload/resnet50_h100.yaml index 92095f9f..ef009f1c 100644 --- a/dlio_benchmark/configs/workload/resnet50_h100.yaml +++ b/dlio_benchmark/configs/workload/resnet50_h100.yaml @@ -23,4 +23,7 @@ reader: read_threads: 8 computation_threads: 8 batch_size: 400 - dont_use_mmap: True \ No newline at end of file + + +metric: + au: 0.90 \ No newline at end of file diff --git a/dlio_benchmark/configs/workload/unet3d_a100.yaml b/dlio_benchmark/configs/workload/unet3d_a100.yaml index 97b73c99..e26ce454 100644 --- a/dlio_benchmark/configs/workload/unet3d_a100.yaml +++ b/dlio_benchmark/configs/workload/unet3d_a100.yaml @@ -32,3 +32,6 @@ checkpoint: checkpoint_after_epoch: 5 epochs_between_checkpoints: 2 model_size: 499153191 + +metric: + au: 0.90 \ No newline at end of file diff --git a/dlio_benchmark/configs/workload/unet3d_h100.yaml b/dlio_benchmark/configs/workload/unet3d_h100.yaml index 389f8d62..b9e4398f 100644 --- a/dlio_benchmark/configs/workload/unet3d_h100.yaml +++ b/dlio_benchmark/configs/workload/unet3d_h100.yaml @@ -32,3 +32,6 @@ checkpoint: checkpoint_after_epoch: 5 epochs_between_checkpoints: 2 model_size: 499153191 + +metric: + au: 0.90 \ No newline at end of file diff --git a/dlio_benchmark/utils/config.py b/dlio_benchmark/utils/config.py index 088ce149..b051f3e9 100644 --- a/dlio_benchmark/utils/config.py +++ b/dlio_benchmark/utils/config.py @@ -133,6 +133,7 @@ class ConfigArguments: training_steps: int = 0 eval_steps: int = 0 samples_per_thread: int = 1 + au: float = 0.90 file_map = None global_index_map = None data_loader_class = None @@ -574,3 +575,7 @@ def LoadConfig(args, config): args.iostat_devices = config['profiling']['iostat_devices'] if isinstance(args.iostat_devices, str): args.iostat_devices = [args.iostat_devices] + + if 'metric' in config: + if 'au' in config['metric']: + args.au = config['metric']['au'] diff --git a/dlio_benchmark/utils/statscounter.py b/dlio_benchmark/utils/statscounter.py index 8ba652e8..5d2c8823 100644 --- a/dlio_benchmark/utils/statscounter.py +++ b/dlio_benchmark/utils/statscounter.py @@ -142,7 +142,7 @@ def end_run(self): self.summary['epochs'] = len(train_au) self.summary['metric']['train_au_percentage'] = list(train_au) self.summary['metric']['train_au_mean_percentage'] = np.mean(train_au) - if self.summary['metric']['train_au_mean_percentage'] >=90: + if self.summary['metric']['train_au_mean_percentage'] >=self.args.au*100: self.summary['metric']['train_au_meet_expectation'] = 'success' else: self.summary['metric']['train_au_meet_expectation'] = 'fail' @@ -157,7 +157,7 @@ def end_run(self): eval_throughput = self.comm.allreduce(self.eval_throughput) self.summary['metric']['eval_au_percentage'] = list(eval_au) self.summary['metric']['eval_au_mean_percentage'] = np.mean(eval_au) - if self.summary['metric']['eval_au_mean_percentage'] >=90: + if self.summary['metric']['eval_au_mean_percentage'] >=self.args.au*100: self.summary['metric']['eval_au_meet_expectation'] = 'success' else: self.summary['metric']['eval_au_meet_expectation'] = 'fail' From 03bded3de3d8353234ffd33eec5c2bf53d8322fb Mon Sep 17 00:00:00 2001 From: Huihuo Zheng Date: Fri, 7 Jun 2024 01:03:09 -0500 Subject: [PATCH 09/19] removed dependencies on dlioprofiler --- .../checkpointing/pytorch_checkpointing.py | 2 +- .../checkpointing/tf_checkpointing.py | 2 +- .../data_generator/hdf5_generator.py | 2 +- .../data_generator/indexed_binary_generator.py | 2 +- .../data_generator/jpeg_generator.py | 2 +- dlio_benchmark/data_generator/npy_generator.py | 2 +- dlio_benchmark/data_generator/npz_generator.py | 2 +- dlio_benchmark/data_generator/png_generator.py | 2 +- .../data_generator/synthetic_generator.py | 2 +- dlio_benchmark/data_generator/tf_generator.py | 2 +- dlio_benchmark/data_loader/dali_data_loader.py | 2 +- .../data_loader/synthetic_data_loader.py | 2 +- dlio_benchmark/data_loader/tf_data_loader.py | 2 +- .../data_loader/torch_data_loader.py | 2 +- dlio_benchmark/framework/tf_framework.py | 2 +- dlio_benchmark/framework/torch_framework.py | 2 +- dlio_benchmark/main.py | 2 +- dlio_benchmark/reader/csv_reader.py | 2 +- dlio_benchmark/reader/hdf5_reader.py | 2 +- dlio_benchmark/reader/image_reader.py | 2 +- .../reader/indexed_binary_mmap_reader.py | 2 +- dlio_benchmark/reader/indexed_binary_reader.py | 2 +- dlio_benchmark/reader/npy_reader.py | 2 +- dlio_benchmark/reader/npz_reader.py | 2 +- dlio_benchmark/reader/reader_handler.py | 2 +- dlio_benchmark/reader/synthetic_reader.py | 2 +- dlio_benchmark/reader/tf_reader.py | 2 +- dlio_benchmark/storage/file_storage.py | 2 +- dlio_benchmark/storage/s3_storage.py | 2 +- dlio_benchmark/utils/utility.py | 18 ++++++++++++++++++ 30 files changed, 47 insertions(+), 29 deletions(-) diff --git a/dlio_benchmark/checkpointing/pytorch_checkpointing.py b/dlio_benchmark/checkpointing/pytorch_checkpointing.py index 6c52e733..156d3dc7 100644 --- a/dlio_benchmark/checkpointing/pytorch_checkpointing.py +++ b/dlio_benchmark/checkpointing/pytorch_checkpointing.py @@ -18,7 +18,7 @@ import torch from dlio_benchmark.checkpointing.base_checkpointing import BaseCheckpointing -from dlio_profiler.logger import fn_interceptor as Profile +from utils.utility import Profile from dlio_benchmark.common.constants import MODULE_CHECKPOINT from dlio_benchmark.common.enumerations import CheckpointLocationType diff --git a/dlio_benchmark/checkpointing/tf_checkpointing.py b/dlio_benchmark/checkpointing/tf_checkpointing.py index 9cbc44c6..a69fe8fa 100644 --- a/dlio_benchmark/checkpointing/tf_checkpointing.py +++ b/dlio_benchmark/checkpointing/tf_checkpointing.py @@ -17,7 +17,7 @@ import os from dlio_benchmark.checkpointing.base_checkpointing import BaseCheckpointing -from dlio_profiler.logger import fn_interceptor as Profile +from utils.utility import Profile import tensorflow as tf from dlio_benchmark.common.constants import MODULE_CHECKPOINT diff --git a/dlio_benchmark/data_generator/hdf5_generator.py b/dlio_benchmark/data_generator/hdf5_generator.py index ace59fae..e5d33450 100644 --- a/dlio_benchmark/data_generator/hdf5_generator.py +++ b/dlio_benchmark/data_generator/hdf5_generator.py @@ -21,7 +21,7 @@ from dlio_benchmark.common.enumerations import Compression from dlio_benchmark.data_generator.data_generator import DataGenerator from dlio_benchmark.utils.utility import progress -from dlio_profiler.logger import fn_interceptor as Profile +from utils.utility import Profile from shutil import copyfile from dlio_benchmark.common.constants import MODULE_DATA_GENERATOR diff --git a/dlio_benchmark/data_generator/indexed_binary_generator.py b/dlio_benchmark/data_generator/indexed_binary_generator.py index 7f16032e..790095af 100644 --- a/dlio_benchmark/data_generator/indexed_binary_generator.py +++ b/dlio_benchmark/data_generator/indexed_binary_generator.py @@ -22,7 +22,7 @@ import numpy as np from dlio_benchmark.utils.utility import progress, utcnow, DLIOMPI -from dlio_profiler.logger import fn_interceptor as Profile +from utils.utility import Profile from shutil import copyfile from dlio_benchmark.common.constants import MODULE_DATA_GENERATOR import struct diff --git a/dlio_benchmark/data_generator/jpeg_generator.py b/dlio_benchmark/data_generator/jpeg_generator.py index fdf21e4c..09b6f87d 100644 --- a/dlio_benchmark/data_generator/jpeg_generator.py +++ b/dlio_benchmark/data_generator/jpeg_generator.py @@ -22,7 +22,7 @@ import numpy as np from dlio_benchmark.utils.utility import progress, utcnow -from dlio_profiler.logger import fn_interceptor as Profile +from utils.utility import Profile from shutil import copyfile import PIL.Image as im from dlio_benchmark.common.constants import MODULE_DATA_GENERATOR diff --git a/dlio_benchmark/data_generator/npy_generator.py b/dlio_benchmark/data_generator/npy_generator.py index de9f27bb..45584193 100644 --- a/dlio_benchmark/data_generator/npy_generator.py +++ b/dlio_benchmark/data_generator/npy_generator.py @@ -22,7 +22,7 @@ import numpy as np from dlio_benchmark.utils.utility import progress, utcnow -from dlio_profiler.logger import fn_interceptor as Profile +from utils.utility import Profile from shutil import copyfile from dlio_benchmark.common.constants import MODULE_DATA_GENERATOR diff --git a/dlio_benchmark/data_generator/npz_generator.py b/dlio_benchmark/data_generator/npz_generator.py index 74ecdd1e..fb53559d 100644 --- a/dlio_benchmark/data_generator/npz_generator.py +++ b/dlio_benchmark/data_generator/npz_generator.py @@ -22,7 +22,7 @@ import numpy as np from dlio_benchmark.utils.utility import progress, utcnow -from dlio_profiler.logger import fn_interceptor as Profile +from utils.utility import Profile from shutil import copyfile from dlio_benchmark.common.constants import MODULE_DATA_GENERATOR diff --git a/dlio_benchmark/data_generator/png_generator.py b/dlio_benchmark/data_generator/png_generator.py index 48343bbf..47874f3c 100644 --- a/dlio_benchmark/data_generator/png_generator.py +++ b/dlio_benchmark/data_generator/png_generator.py @@ -22,7 +22,7 @@ import numpy as np from dlio_benchmark.utils.utility import progress, utcnow -from dlio_profiler.logger import fn_interceptor as Profile +from utils.utility import Profile from shutil import copyfile import PIL.Image as im from dlio_benchmark.common.constants import MODULE_DATA_GENERATOR diff --git a/dlio_benchmark/data_generator/synthetic_generator.py b/dlio_benchmark/data_generator/synthetic_generator.py index 5135d7f9..be47f4b0 100644 --- a/dlio_benchmark/data_generator/synthetic_generator.py +++ b/dlio_benchmark/data_generator/synthetic_generator.py @@ -22,7 +22,7 @@ import numpy as np from dlio_benchmark.utils.utility import progress, utcnow -from dlio_profiler.logger import fn_interceptor as Profile +from utils.utility import Profile from shutil import copyfile import PIL.Image as im from dlio_benchmark.common.constants import MODULE_DATA_GENERATOR diff --git a/dlio_benchmark/data_generator/tf_generator.py b/dlio_benchmark/data_generator/tf_generator.py index c4da8616..835240ec 100644 --- a/dlio_benchmark/data_generator/tf_generator.py +++ b/dlio_benchmark/data_generator/tf_generator.py @@ -20,7 +20,7 @@ from dlio_benchmark.data_generator.data_generator import DataGenerator import numpy as np import tensorflow as tf -from dlio_profiler.logger import fn_interceptor as Profile +from utils.utility import Profile from dlio_benchmark.utils.utility import progress, utcnow from shutil import copyfile diff --git a/dlio_benchmark/data_loader/dali_data_loader.py b/dlio_benchmark/data_loader/dali_data_loader.py index d7deb921..f10cd420 100644 --- a/dlio_benchmark/data_loader/dali_data_loader.py +++ b/dlio_benchmark/data_loader/dali_data_loader.py @@ -27,7 +27,7 @@ from dlio_benchmark.data_loader.base_data_loader import BaseDataLoader from dlio_benchmark.reader.reader_factory import ReaderFactory from dlio_benchmark.utils.utility import utcnow -from dlio_profiler.logger import fn_interceptor as Profile +from utils.utility import Profile import os dlp = Profile(MODULE_DATA_LOADER) diff --git a/dlio_benchmark/data_loader/synthetic_data_loader.py b/dlio_benchmark/data_loader/synthetic_data_loader.py index ce54b54e..828db569 100644 --- a/dlio_benchmark/data_loader/synthetic_data_loader.py +++ b/dlio_benchmark/data_loader/synthetic_data_loader.py @@ -27,7 +27,7 @@ from dlio_benchmark.data_loader.base_data_loader import BaseDataLoader from dlio_benchmark.reader.reader_factory import ReaderFactory from dlio_benchmark.utils.utility import utcnow -from dlio_profiler.logger import fn_interceptor as Profile +from utils.utility import Profile import os dlp = Profile(MODULE_DATA_LOADER) diff --git a/dlio_benchmark/data_loader/tf_data_loader.py b/dlio_benchmark/data_loader/tf_data_loader.py index edf01d30..7d81d17a 100644 --- a/dlio_benchmark/data_loader/tf_data_loader.py +++ b/dlio_benchmark/data_loader/tf_data_loader.py @@ -25,7 +25,7 @@ from dlio_benchmark.data_loader.base_data_loader import BaseDataLoader from dlio_benchmark.reader.reader_factory import ReaderFactory from dlio_benchmark.utils.utility import utcnow -from dlio_profiler.logger import fn_interceptor as Profile +from utils.utility import Profile import numpy as np diff --git a/dlio_benchmark/data_loader/torch_data_loader.py b/dlio_benchmark/data_loader/torch_data_loader.py index e2ae4f95..bacd8ed7 100644 --- a/dlio_benchmark/data_loader/torch_data_loader.py +++ b/dlio_benchmark/data_loader/torch_data_loader.py @@ -29,7 +29,7 @@ from dlio_benchmark.reader.reader_factory import ReaderFactory from dlio_benchmark.utils.utility import utcnow, DLIOMPI from dlio_benchmark.utils.config import ConfigArguments -from dlio_profiler.logger import fn_interceptor as Profile +from utils.utility import Profile dlp = Profile(MODULE_DATA_LOADER) diff --git a/dlio_benchmark/framework/tf_framework.py b/dlio_benchmark/framework/tf_framework.py index 2e21e151..115b9077 100644 --- a/dlio_benchmark/framework/tf_framework.py +++ b/dlio_benchmark/framework/tf_framework.py @@ -22,7 +22,7 @@ from dlio_benchmark.common.constants import MODULE_AI_FRAMEWORK from dlio_benchmark.data_loader.data_loader_factory import DataLoaderFactory from dlio_benchmark.utils.utility import utcnow, DLIOMPI -from dlio_profiler.logger import fn_interceptor as Profile +from utils.utility import Profile from dlio_benchmark.common.error_code import ErrorCodes from dlio_benchmark.framework.framework import Framework from dlio_benchmark.reader.reader_factory import ReaderFactory diff --git a/dlio_benchmark/framework/torch_framework.py b/dlio_benchmark/framework/torch_framework.py index 8660914c..2adddadf 100644 --- a/dlio_benchmark/framework/torch_framework.py +++ b/dlio_benchmark/framework/torch_framework.py @@ -25,7 +25,7 @@ import functools import logging from dlio_benchmark.utils.utility import utcnow, DLIOMPI -from dlio_profiler.logger import fn_interceptor as Profile +from utils.utility import Profile from time import sleep, time diff --git a/dlio_benchmark/main.py b/dlio_benchmark/main.py index ebeb5a9f..8dbfd6ed 100644 --- a/dlio_benchmark/main.py +++ b/dlio_benchmark/main.py @@ -46,7 +46,7 @@ from dlio_benchmark.framework.framework_factory import FrameworkFactory from dlio_benchmark.data_generator.generator_factory import GeneratorFactory from dlio_benchmark.storage.storage_factory import StorageFactory -from dlio_profiler.logger import dlio_logger as PerfTrace, fn_interceptor as Profile +from utils.utility import Profile, PerfTrace dlp = Profile(MODULE_DLIO_BENCHMARK) diff --git a/dlio_benchmark/reader/csv_reader.py b/dlio_benchmark/reader/csv_reader.py index 75ca7577..cf1d0ff1 100644 --- a/dlio_benchmark/reader/csv_reader.py +++ b/dlio_benchmark/reader/csv_reader.py @@ -17,7 +17,7 @@ import pandas as pd from dlio_benchmark.common.constants import MODULE_DATA_READER -from dlio_profiler.logger import fn_interceptor as Profile +from utils.utility import Profile from dlio_benchmark.reader.reader_handler import FormatReader dlp = Profile(MODULE_DATA_READER) diff --git a/dlio_benchmark/reader/hdf5_reader.py b/dlio_benchmark/reader/hdf5_reader.py index c2ffe7ec..a5852dbd 100644 --- a/dlio_benchmark/reader/hdf5_reader.py +++ b/dlio_benchmark/reader/hdf5_reader.py @@ -19,7 +19,7 @@ import h5py from dlio_benchmark.common.constants import MODULE_DATA_READER -from dlio_profiler.logger import fn_interceptor as Profile +from utils.utility import Profile from dlio_benchmark.reader.reader_handler import FormatReader dlp = Profile(MODULE_DATA_READER) diff --git a/dlio_benchmark/reader/image_reader.py b/dlio_benchmark/reader/image_reader.py index 6466ad5f..72fc0b88 100644 --- a/dlio_benchmark/reader/image_reader.py +++ b/dlio_benchmark/reader/image_reader.py @@ -22,7 +22,7 @@ from dlio_benchmark.common.constants import MODULE_DATA_READER from dlio_benchmark.reader.reader_handler import FormatReader from dlio_benchmark.utils.utility import utcnow -from dlio_profiler.logger import fn_interceptor as Profile +from utils.utility import Profile dlp = Profile(MODULE_DATA_READER) diff --git a/dlio_benchmark/reader/indexed_binary_mmap_reader.py b/dlio_benchmark/reader/indexed_binary_mmap_reader.py index 7dce843c..af19308c 100644 --- a/dlio_benchmark/reader/indexed_binary_mmap_reader.py +++ b/dlio_benchmark/reader/indexed_binary_mmap_reader.py @@ -22,7 +22,7 @@ from dlio_benchmark.common.constants import MODULE_DATA_READER from dlio_benchmark.common.enumerations import DataLoaderSampler from dlio_benchmark.reader.reader_handler import FormatReader -from dlio_profiler.logger import fn_interceptor as Profile +from utils.utility import Profile dlp = Profile(MODULE_DATA_READER) diff --git a/dlio_benchmark/reader/indexed_binary_reader.py b/dlio_benchmark/reader/indexed_binary_reader.py index 905ec337..5cdf7b65 100644 --- a/dlio_benchmark/reader/indexed_binary_reader.py +++ b/dlio_benchmark/reader/indexed_binary_reader.py @@ -22,7 +22,7 @@ from dlio_benchmark.common.constants import MODULE_DATA_READER from dlio_benchmark.common.enumerations import DataLoaderSampler from dlio_benchmark.reader.reader_handler import FormatReader -from dlio_profiler.logger import fn_interceptor as Profile +from utils.utility import Profile dlp = Profile(MODULE_DATA_READER) diff --git a/dlio_benchmark/reader/npy_reader.py b/dlio_benchmark/reader/npy_reader.py index 73955a4d..d5ac63c6 100644 --- a/dlio_benchmark/reader/npy_reader.py +++ b/dlio_benchmark/reader/npy_reader.py @@ -18,7 +18,7 @@ from dlio_benchmark.common.constants import MODULE_DATA_READER from dlio_benchmark.reader.reader_handler import FormatReader -from dlio_profiler.logger import fn_interceptor as Profile +from utils.utility import Profile dlp = Profile(MODULE_DATA_READER) diff --git a/dlio_benchmark/reader/npz_reader.py b/dlio_benchmark/reader/npz_reader.py index 685a1815..2533f5c1 100644 --- a/dlio_benchmark/reader/npz_reader.py +++ b/dlio_benchmark/reader/npz_reader.py @@ -18,7 +18,7 @@ from dlio_benchmark.common.constants import MODULE_DATA_READER from dlio_benchmark.reader.reader_handler import FormatReader -from dlio_profiler.logger import fn_interceptor as Profile +from utils.utility import Profile dlp = Profile(MODULE_DATA_READER) diff --git a/dlio_benchmark/reader/reader_handler.py b/dlio_benchmark/reader/reader_handler.py index ff459cf2..ea9d0ec2 100644 --- a/dlio_benchmark/reader/reader_handler.py +++ b/dlio_benchmark/reader/reader_handler.py @@ -21,7 +21,7 @@ from dlio_benchmark.framework.framework_factory import FrameworkFactory from dlio_benchmark.storage.storage_factory import StorageFactory from dlio_benchmark.utils.utility import utcnow -from dlio_profiler.logger import fn_interceptor as Profile +from utils.utility import Profile from dlio_benchmark.utils.config import ConfigArguments import numpy as np import os diff --git a/dlio_benchmark/reader/synthetic_reader.py b/dlio_benchmark/reader/synthetic_reader.py index 5114a0ce..58e9ab81 100644 --- a/dlio_benchmark/reader/synthetic_reader.py +++ b/dlio_benchmark/reader/synthetic_reader.py @@ -18,7 +18,7 @@ from dlio_benchmark.common.constants import MODULE_DATA_READER from dlio_benchmark.reader.reader_handler import FormatReader -from dlio_profiler.logger import fn_interceptor as Profile +from utils.utility import Profile dlp = Profile(MODULE_DATA_READER) diff --git a/dlio_benchmark/reader/tf_reader.py b/dlio_benchmark/reader/tf_reader.py index bc98d523..311ef7ac 100644 --- a/dlio_benchmark/reader/tf_reader.py +++ b/dlio_benchmark/reader/tf_reader.py @@ -20,7 +20,7 @@ from dlio_benchmark.common.constants import MODULE_DATA_READER from dlio_benchmark.utils.utility import utcnow -from dlio_profiler.logger import fn_interceptor as Profile +from utils.utility import Profile from dlio_benchmark.common.enumerations import DatasetType, Shuffle from dlio_benchmark.reader.reader_handler import FormatReader import tensorflow as tf diff --git a/dlio_benchmark/storage/file_storage.py b/dlio_benchmark/storage/file_storage.py index 26ae8a0b..cc31d30c 100644 --- a/dlio_benchmark/storage/file_storage.py +++ b/dlio_benchmark/storage/file_storage.py @@ -24,7 +24,7 @@ import glob import shutil -from dlio_profiler.logger import fn_interceptor as Profile +from utils.utility import Profile dlp = Profile(MODULE_STORAGE) diff --git a/dlio_benchmark/storage/s3_storage.py b/dlio_benchmark/storage/s3_storage.py index 8fbaeeda..1cdb5fb3 100644 --- a/dlio_benchmark/storage/s3_storage.py +++ b/dlio_benchmark/storage/s3_storage.py @@ -21,7 +21,7 @@ from dlio_benchmark.common.enumerations import NamespaceType, MetadataType import os -from dlio_profiler.logger import fn_interceptor as Profile +from utils.utility import Profile dlp = Profile(MODULE_STORAGE) diff --git a/dlio_benchmark/utils/utility.py b/dlio_benchmark/utils/utility.py index f760255f..4a04956b 100644 --- a/dlio_benchmark/utils/utility.py +++ b/dlio_benchmark/utils/utility.py @@ -33,6 +33,24 @@ import importlib.util # UTC timestamp format with microsecond precision from dlio_benchmark.common.enumerations import LoggerType, MPIState +try: + from dlio_profiler.logger import dlio_logger as PerfTrace, fn_interceptor as Profile +except: + class Profile: + def __init__(self, name=None, cat=None): + self.type = type + def log(self, func): + return func + def iter(self, a): + return a + class dlio_logger: + def __init__(self,): + self.type = None + def initialize_log(self, logfile=None, data_dir=None, process_id=-1): + return + def iter(self, a): + return a + PerfTrace = dlio_logger() LOG_TS_FORMAT = "%Y-%m-%dT%H:%M:%S.%f" From 4e3ee0dd0da88c02f2dfe37126d7743558467238 Mon Sep 17 00:00:00 2001 From: Huihuo Zheng Date: Fri, 7 Jun 2024 01:13:46 -0500 Subject: [PATCH 10/19] fixed bugs --- dlio_benchmark/checkpointing/pytorch_checkpointing.py | 2 +- dlio_benchmark/checkpointing/tf_checkpointing.py | 2 +- dlio_benchmark/data_generator/hdf5_generator.py | 2 +- dlio_benchmark/data_generator/indexed_binary_generator.py | 2 +- dlio_benchmark/data_generator/jpeg_generator.py | 2 +- dlio_benchmark/data_generator/npy_generator.py | 2 +- dlio_benchmark/data_generator/npz_generator.py | 2 +- dlio_benchmark/data_generator/png_generator.py | 2 +- dlio_benchmark/data_generator/synthetic_generator.py | 2 +- dlio_benchmark/data_generator/tf_generator.py | 2 +- dlio_benchmark/data_loader/dali_data_loader.py | 2 +- dlio_benchmark/data_loader/synthetic_data_loader.py | 2 +- dlio_benchmark/data_loader/tf_data_loader.py | 2 +- dlio_benchmark/data_loader/torch_data_loader.py | 2 +- dlio_benchmark/framework/tf_framework.py | 2 +- dlio_benchmark/framework/torch_framework.py | 2 +- dlio_benchmark/main.py | 2 +- dlio_benchmark/reader/csv_reader.py | 2 +- dlio_benchmark/reader/hdf5_reader.py | 2 +- dlio_benchmark/reader/image_reader.py | 2 +- dlio_benchmark/reader/indexed_binary_mmap_reader.py | 2 +- dlio_benchmark/reader/indexed_binary_reader.py | 2 +- dlio_benchmark/reader/npy_reader.py | 2 +- dlio_benchmark/reader/npz_reader.py | 2 +- dlio_benchmark/reader/reader_handler.py | 2 +- dlio_benchmark/reader/synthetic_reader.py | 2 +- dlio_benchmark/reader/tf_reader.py | 2 +- dlio_benchmark/storage/file_storage.py | 2 +- dlio_benchmark/storage/s3_storage.py | 2 +- dlio_benchmark/utils/config.py | 3 +-- dlio_benchmark/utils/utility.py | 3 ++- requirements.txt | 2 +- 32 files changed, 33 insertions(+), 33 deletions(-) diff --git a/dlio_benchmark/checkpointing/pytorch_checkpointing.py b/dlio_benchmark/checkpointing/pytorch_checkpointing.py index 156d3dc7..ba8436c3 100644 --- a/dlio_benchmark/checkpointing/pytorch_checkpointing.py +++ b/dlio_benchmark/checkpointing/pytorch_checkpointing.py @@ -18,7 +18,7 @@ import torch from dlio_benchmark.checkpointing.base_checkpointing import BaseCheckpointing -from utils.utility import Profile +from dlio_benchmark.utils.utility import Profile from dlio_benchmark.common.constants import MODULE_CHECKPOINT from dlio_benchmark.common.enumerations import CheckpointLocationType diff --git a/dlio_benchmark/checkpointing/tf_checkpointing.py b/dlio_benchmark/checkpointing/tf_checkpointing.py index a69fe8fa..211f0376 100644 --- a/dlio_benchmark/checkpointing/tf_checkpointing.py +++ b/dlio_benchmark/checkpointing/tf_checkpointing.py @@ -17,7 +17,7 @@ import os from dlio_benchmark.checkpointing.base_checkpointing import BaseCheckpointing -from utils.utility import Profile +from dlio_benchmark.utils.utility import Profile import tensorflow as tf from dlio_benchmark.common.constants import MODULE_CHECKPOINT diff --git a/dlio_benchmark/data_generator/hdf5_generator.py b/dlio_benchmark/data_generator/hdf5_generator.py index e5d33450..81911440 100644 --- a/dlio_benchmark/data_generator/hdf5_generator.py +++ b/dlio_benchmark/data_generator/hdf5_generator.py @@ -21,7 +21,7 @@ from dlio_benchmark.common.enumerations import Compression from dlio_benchmark.data_generator.data_generator import DataGenerator from dlio_benchmark.utils.utility import progress -from utils.utility import Profile +from dlio_benchmark.utils.utility import Profile from shutil import copyfile from dlio_benchmark.common.constants import MODULE_DATA_GENERATOR diff --git a/dlio_benchmark/data_generator/indexed_binary_generator.py b/dlio_benchmark/data_generator/indexed_binary_generator.py index 790095af..6a7013b9 100644 --- a/dlio_benchmark/data_generator/indexed_binary_generator.py +++ b/dlio_benchmark/data_generator/indexed_binary_generator.py @@ -22,7 +22,7 @@ import numpy as np from dlio_benchmark.utils.utility import progress, utcnow, DLIOMPI -from utils.utility import Profile +from dlio_benchmark.utils.utility import Profile from shutil import copyfile from dlio_benchmark.common.constants import MODULE_DATA_GENERATOR import struct diff --git a/dlio_benchmark/data_generator/jpeg_generator.py b/dlio_benchmark/data_generator/jpeg_generator.py index 09b6f87d..3be0d360 100644 --- a/dlio_benchmark/data_generator/jpeg_generator.py +++ b/dlio_benchmark/data_generator/jpeg_generator.py @@ -22,7 +22,7 @@ import numpy as np from dlio_benchmark.utils.utility import progress, utcnow -from utils.utility import Profile +from dlio_benchmark.utils.utility import Profile from shutil import copyfile import PIL.Image as im from dlio_benchmark.common.constants import MODULE_DATA_GENERATOR diff --git a/dlio_benchmark/data_generator/npy_generator.py b/dlio_benchmark/data_generator/npy_generator.py index 45584193..8bf73033 100644 --- a/dlio_benchmark/data_generator/npy_generator.py +++ b/dlio_benchmark/data_generator/npy_generator.py @@ -22,7 +22,7 @@ import numpy as np from dlio_benchmark.utils.utility import progress, utcnow -from utils.utility import Profile +from dlio_benchmark.utils.utility import Profile from shutil import copyfile from dlio_benchmark.common.constants import MODULE_DATA_GENERATOR diff --git a/dlio_benchmark/data_generator/npz_generator.py b/dlio_benchmark/data_generator/npz_generator.py index fb53559d..13ee8785 100644 --- a/dlio_benchmark/data_generator/npz_generator.py +++ b/dlio_benchmark/data_generator/npz_generator.py @@ -22,7 +22,7 @@ import numpy as np from dlio_benchmark.utils.utility import progress, utcnow -from utils.utility import Profile +from dlio_benchmark.utils.utility import Profile from shutil import copyfile from dlio_benchmark.common.constants import MODULE_DATA_GENERATOR diff --git a/dlio_benchmark/data_generator/png_generator.py b/dlio_benchmark/data_generator/png_generator.py index 47874f3c..b34d5ab0 100644 --- a/dlio_benchmark/data_generator/png_generator.py +++ b/dlio_benchmark/data_generator/png_generator.py @@ -22,7 +22,7 @@ import numpy as np from dlio_benchmark.utils.utility import progress, utcnow -from utils.utility import Profile +from dlio_benchmark.utils.utility import Profile from shutil import copyfile import PIL.Image as im from dlio_benchmark.common.constants import MODULE_DATA_GENERATOR diff --git a/dlio_benchmark/data_generator/synthetic_generator.py b/dlio_benchmark/data_generator/synthetic_generator.py index be47f4b0..d8a54118 100644 --- a/dlio_benchmark/data_generator/synthetic_generator.py +++ b/dlio_benchmark/data_generator/synthetic_generator.py @@ -22,7 +22,7 @@ import numpy as np from dlio_benchmark.utils.utility import progress, utcnow -from utils.utility import Profile +from dlio_benchmark.utils.utility import Profile from shutil import copyfile import PIL.Image as im from dlio_benchmark.common.constants import MODULE_DATA_GENERATOR diff --git a/dlio_benchmark/data_generator/tf_generator.py b/dlio_benchmark/data_generator/tf_generator.py index 835240ec..b1151db6 100644 --- a/dlio_benchmark/data_generator/tf_generator.py +++ b/dlio_benchmark/data_generator/tf_generator.py @@ -20,7 +20,7 @@ from dlio_benchmark.data_generator.data_generator import DataGenerator import numpy as np import tensorflow as tf -from utils.utility import Profile +from dlio_benchmark.utils.utility import Profile from dlio_benchmark.utils.utility import progress, utcnow from shutil import copyfile diff --git a/dlio_benchmark/data_loader/dali_data_loader.py b/dlio_benchmark/data_loader/dali_data_loader.py index f10cd420..2dc12944 100644 --- a/dlio_benchmark/data_loader/dali_data_loader.py +++ b/dlio_benchmark/data_loader/dali_data_loader.py @@ -27,7 +27,7 @@ from dlio_benchmark.data_loader.base_data_loader import BaseDataLoader from dlio_benchmark.reader.reader_factory import ReaderFactory from dlio_benchmark.utils.utility import utcnow -from utils.utility import Profile +from dlio_benchmark.utils.utility import Profile import os dlp = Profile(MODULE_DATA_LOADER) diff --git a/dlio_benchmark/data_loader/synthetic_data_loader.py b/dlio_benchmark/data_loader/synthetic_data_loader.py index 828db569..32f0985d 100644 --- a/dlio_benchmark/data_loader/synthetic_data_loader.py +++ b/dlio_benchmark/data_loader/synthetic_data_loader.py @@ -27,7 +27,7 @@ from dlio_benchmark.data_loader.base_data_loader import BaseDataLoader from dlio_benchmark.reader.reader_factory import ReaderFactory from dlio_benchmark.utils.utility import utcnow -from utils.utility import Profile +from dlio_benchmark.utils.utility import Profile import os dlp = Profile(MODULE_DATA_LOADER) diff --git a/dlio_benchmark/data_loader/tf_data_loader.py b/dlio_benchmark/data_loader/tf_data_loader.py index 7d81d17a..162e6c1f 100644 --- a/dlio_benchmark/data_loader/tf_data_loader.py +++ b/dlio_benchmark/data_loader/tf_data_loader.py @@ -25,7 +25,7 @@ from dlio_benchmark.data_loader.base_data_loader import BaseDataLoader from dlio_benchmark.reader.reader_factory import ReaderFactory from dlio_benchmark.utils.utility import utcnow -from utils.utility import Profile +from dlio_benchmark.utils.utility import Profile import numpy as np diff --git a/dlio_benchmark/data_loader/torch_data_loader.py b/dlio_benchmark/data_loader/torch_data_loader.py index bacd8ed7..e72559ef 100644 --- a/dlio_benchmark/data_loader/torch_data_loader.py +++ b/dlio_benchmark/data_loader/torch_data_loader.py @@ -29,7 +29,7 @@ from dlio_benchmark.reader.reader_factory import ReaderFactory from dlio_benchmark.utils.utility import utcnow, DLIOMPI from dlio_benchmark.utils.config import ConfigArguments -from utils.utility import Profile +from dlio_benchmark.utils.utility import Profile dlp = Profile(MODULE_DATA_LOADER) diff --git a/dlio_benchmark/framework/tf_framework.py b/dlio_benchmark/framework/tf_framework.py index 115b9077..1b1a049f 100644 --- a/dlio_benchmark/framework/tf_framework.py +++ b/dlio_benchmark/framework/tf_framework.py @@ -22,7 +22,7 @@ from dlio_benchmark.common.constants import MODULE_AI_FRAMEWORK from dlio_benchmark.data_loader.data_loader_factory import DataLoaderFactory from dlio_benchmark.utils.utility import utcnow, DLIOMPI -from utils.utility import Profile +from dlio_benchmark.utils.utility import Profile from dlio_benchmark.common.error_code import ErrorCodes from dlio_benchmark.framework.framework import Framework from dlio_benchmark.reader.reader_factory import ReaderFactory diff --git a/dlio_benchmark/framework/torch_framework.py b/dlio_benchmark/framework/torch_framework.py index 2adddadf..1d4b85c0 100644 --- a/dlio_benchmark/framework/torch_framework.py +++ b/dlio_benchmark/framework/torch_framework.py @@ -25,7 +25,7 @@ import functools import logging from dlio_benchmark.utils.utility import utcnow, DLIOMPI -from utils.utility import Profile +from dlio_benchmark.utils.utility import Profile from time import sleep, time diff --git a/dlio_benchmark/main.py b/dlio_benchmark/main.py index 8dbfd6ed..8669b41c 100644 --- a/dlio_benchmark/main.py +++ b/dlio_benchmark/main.py @@ -46,7 +46,7 @@ from dlio_benchmark.framework.framework_factory import FrameworkFactory from dlio_benchmark.data_generator.generator_factory import GeneratorFactory from dlio_benchmark.storage.storage_factory import StorageFactory -from utils.utility import Profile, PerfTrace +from dlio_benchmark.utils.utility import Profile, PerfTrace dlp = Profile(MODULE_DLIO_BENCHMARK) diff --git a/dlio_benchmark/reader/csv_reader.py b/dlio_benchmark/reader/csv_reader.py index cf1d0ff1..797ce886 100644 --- a/dlio_benchmark/reader/csv_reader.py +++ b/dlio_benchmark/reader/csv_reader.py @@ -17,7 +17,7 @@ import pandas as pd from dlio_benchmark.common.constants import MODULE_DATA_READER -from utils.utility import Profile +from dlio_benchmark.utils.utility import Profile from dlio_benchmark.reader.reader_handler import FormatReader dlp = Profile(MODULE_DATA_READER) diff --git a/dlio_benchmark/reader/hdf5_reader.py b/dlio_benchmark/reader/hdf5_reader.py index a5852dbd..6a1fc497 100644 --- a/dlio_benchmark/reader/hdf5_reader.py +++ b/dlio_benchmark/reader/hdf5_reader.py @@ -19,7 +19,7 @@ import h5py from dlio_benchmark.common.constants import MODULE_DATA_READER -from utils.utility import Profile +from dlio_benchmark.utils.utility import Profile from dlio_benchmark.reader.reader_handler import FormatReader dlp = Profile(MODULE_DATA_READER) diff --git a/dlio_benchmark/reader/image_reader.py b/dlio_benchmark/reader/image_reader.py index 72fc0b88..6b3ef487 100644 --- a/dlio_benchmark/reader/image_reader.py +++ b/dlio_benchmark/reader/image_reader.py @@ -22,7 +22,7 @@ from dlio_benchmark.common.constants import MODULE_DATA_READER from dlio_benchmark.reader.reader_handler import FormatReader from dlio_benchmark.utils.utility import utcnow -from utils.utility import Profile +from dlio_benchmark.utils.utility import Profile dlp = Profile(MODULE_DATA_READER) diff --git a/dlio_benchmark/reader/indexed_binary_mmap_reader.py b/dlio_benchmark/reader/indexed_binary_mmap_reader.py index af19308c..bb86e4d3 100644 --- a/dlio_benchmark/reader/indexed_binary_mmap_reader.py +++ b/dlio_benchmark/reader/indexed_binary_mmap_reader.py @@ -22,7 +22,7 @@ from dlio_benchmark.common.constants import MODULE_DATA_READER from dlio_benchmark.common.enumerations import DataLoaderSampler from dlio_benchmark.reader.reader_handler import FormatReader -from utils.utility import Profile +from dlio_benchmark.utils.utility import Profile dlp = Profile(MODULE_DATA_READER) diff --git a/dlio_benchmark/reader/indexed_binary_reader.py b/dlio_benchmark/reader/indexed_binary_reader.py index 5cdf7b65..0eb50f03 100644 --- a/dlio_benchmark/reader/indexed_binary_reader.py +++ b/dlio_benchmark/reader/indexed_binary_reader.py @@ -22,7 +22,7 @@ from dlio_benchmark.common.constants import MODULE_DATA_READER from dlio_benchmark.common.enumerations import DataLoaderSampler from dlio_benchmark.reader.reader_handler import FormatReader -from utils.utility import Profile +from dlio_benchmark.utils.utility import Profile dlp = Profile(MODULE_DATA_READER) diff --git a/dlio_benchmark/reader/npy_reader.py b/dlio_benchmark/reader/npy_reader.py index d5ac63c6..bf4f57f5 100644 --- a/dlio_benchmark/reader/npy_reader.py +++ b/dlio_benchmark/reader/npy_reader.py @@ -18,7 +18,7 @@ from dlio_benchmark.common.constants import MODULE_DATA_READER from dlio_benchmark.reader.reader_handler import FormatReader -from utils.utility import Profile +from dlio_benchmark.utils.utility import Profile dlp = Profile(MODULE_DATA_READER) diff --git a/dlio_benchmark/reader/npz_reader.py b/dlio_benchmark/reader/npz_reader.py index 2533f5c1..350d4baf 100644 --- a/dlio_benchmark/reader/npz_reader.py +++ b/dlio_benchmark/reader/npz_reader.py @@ -18,7 +18,7 @@ from dlio_benchmark.common.constants import MODULE_DATA_READER from dlio_benchmark.reader.reader_handler import FormatReader -from utils.utility import Profile +from dlio_benchmark.utils.utility import Profile dlp = Profile(MODULE_DATA_READER) diff --git a/dlio_benchmark/reader/reader_handler.py b/dlio_benchmark/reader/reader_handler.py index ea9d0ec2..fd6c295d 100644 --- a/dlio_benchmark/reader/reader_handler.py +++ b/dlio_benchmark/reader/reader_handler.py @@ -21,7 +21,7 @@ from dlio_benchmark.framework.framework_factory import FrameworkFactory from dlio_benchmark.storage.storage_factory import StorageFactory from dlio_benchmark.utils.utility import utcnow -from utils.utility import Profile +from dlio_benchmark.utils.utility import Profile from dlio_benchmark.utils.config import ConfigArguments import numpy as np import os diff --git a/dlio_benchmark/reader/synthetic_reader.py b/dlio_benchmark/reader/synthetic_reader.py index 58e9ab81..b2690dc5 100644 --- a/dlio_benchmark/reader/synthetic_reader.py +++ b/dlio_benchmark/reader/synthetic_reader.py @@ -18,7 +18,7 @@ from dlio_benchmark.common.constants import MODULE_DATA_READER from dlio_benchmark.reader.reader_handler import FormatReader -from utils.utility import Profile +from dlio_benchmark.utils.utility import Profile dlp = Profile(MODULE_DATA_READER) diff --git a/dlio_benchmark/reader/tf_reader.py b/dlio_benchmark/reader/tf_reader.py index 311ef7ac..0909cc8e 100644 --- a/dlio_benchmark/reader/tf_reader.py +++ b/dlio_benchmark/reader/tf_reader.py @@ -20,7 +20,7 @@ from dlio_benchmark.common.constants import MODULE_DATA_READER from dlio_benchmark.utils.utility import utcnow -from utils.utility import Profile +from dlio_benchmark.utils.utility import Profile from dlio_benchmark.common.enumerations import DatasetType, Shuffle from dlio_benchmark.reader.reader_handler import FormatReader import tensorflow as tf diff --git a/dlio_benchmark/storage/file_storage.py b/dlio_benchmark/storage/file_storage.py index cc31d30c..b7b122a2 100644 --- a/dlio_benchmark/storage/file_storage.py +++ b/dlio_benchmark/storage/file_storage.py @@ -24,7 +24,7 @@ import glob import shutil -from utils.utility import Profile +from dlio_benchmark.utils.utility import Profile dlp = Profile(MODULE_STORAGE) diff --git a/dlio_benchmark/storage/s3_storage.py b/dlio_benchmark/storage/s3_storage.py index 1cdb5fb3..c3e0637e 100644 --- a/dlio_benchmark/storage/s3_storage.py +++ b/dlio_benchmark/storage/s3_storage.py @@ -21,7 +21,7 @@ from dlio_benchmark.common.enumerations import NamespaceType, MetadataType import os -from utils.utility import Profile +from dlio_benchmark.utils.utility import Profile dlp = Profile(MODULE_STORAGE) diff --git a/dlio_benchmark/utils/config.py b/dlio_benchmark/utils/config.py index b051f3e9..73042155 100644 --- a/dlio_benchmark/utils/config.py +++ b/dlio_benchmark/utils/config.py @@ -33,8 +33,7 @@ import math import os import numpy as np - -from dlio_profiler.logger import dlio_logger as PerfTrace, fn_interceptor as Profile, DLIO_PROFILER_ENABLE +from dlio_benchmark.utils.utility import Profile, PerfTrace, DLIO_PROFILER_ENABLE dlp = Profile(MODULE_CONFIG) @dataclass diff --git a/dlio_benchmark/utils/utility.py b/dlio_benchmark/utils/utility.py index 4a04956b..0396595b 100644 --- a/dlio_benchmark/utils/utility.py +++ b/dlio_benchmark/utils/utility.py @@ -34,7 +34,7 @@ # UTC timestamp format with microsecond precision from dlio_benchmark.common.enumerations import LoggerType, MPIState try: - from dlio_profiler.logger import dlio_logger as PerfTrace, fn_interceptor as Profile + from dlio_profiler.logger import dlio_logger as PerfTrace, fn_interceptor as Profile, DLIO_PROFILER_ENABLE except: class Profile: def __init__(self, name=None, cat=None): @@ -51,6 +51,7 @@ def initialize_log(self, logfile=None, data_dir=None, process_id=-1): def iter(self, a): return a PerfTrace = dlio_logger() + DLIO_PROFILER_ENABLE = False LOG_TS_FORMAT = "%Y-%m-%dT%H:%M:%S.%f" diff --git a/requirements.txt b/requirements.txt index 26c8828c..388b923f 100644 --- a/requirements.txt +++ b/requirements.txt @@ -4,7 +4,7 @@ astunparse>=1.6.3 cachetools>=5.2.0 certifi>=2022.9.24 charset-normalizer>=2.1.1 -dlio_profiler_py==0.0.3 +#dlio_profiler_py==0.0.5 flatbuffers>=23.5.26 gast>=0.4.0 google-auth>=2.14.1 From 486406f5dae6a58504f3e4e606cad15b6d81e060 Mon Sep 17 00:00:00 2001 From: Huihuo Zheng Date: Fri, 7 Jun 2024 03:31:04 -0500 Subject: [PATCH 11/19] Request changes from MLPerf Storage (#199) * added au metric to the configuration file; set shuffling and shuffle buffer size to be 2 for cosmoflow * removed dependencies on dlioprofiler * fixed bugs --- .../checkpointing/pytorch_checkpointing.py | 2 +- .../checkpointing/tf_checkpointing.py | 2 +- .../configs/workload/cosmoflow_a100.yaml | 6 ++++++ .../configs/workload/cosmoflow_h100.yaml | 8 +++++++- .../configs/workload/resnet50_a100.yaml | 5 ++++- .../configs/workload/resnet50_h100.yaml | 5 ++++- .../configs/workload/unet3d_a100.yaml | 3 +++ .../configs/workload/unet3d_h100.yaml | 3 +++ .../data_generator/hdf5_generator.py | 2 +- .../indexed_binary_generator.py | 2 +- .../data_generator/jpeg_generator.py | 2 +- .../data_generator/npy_generator.py | 2 +- .../data_generator/npz_generator.py | 2 +- .../data_generator/png_generator.py | 2 +- .../data_generator/synthetic_generator.py | 2 +- dlio_benchmark/data_generator/tf_generator.py | 2 +- .../data_loader/dali_data_loader.py | 2 +- .../data_loader/synthetic_data_loader.py | 2 +- dlio_benchmark/data_loader/tf_data_loader.py | 2 +- .../data_loader/torch_data_loader.py | 2 +- dlio_benchmark/framework/tf_framework.py | 2 +- dlio_benchmark/framework/torch_framework.py | 2 +- dlio_benchmark/main.py | 2 +- dlio_benchmark/reader/csv_reader.py | 2 +- dlio_benchmark/reader/hdf5_reader.py | 2 +- dlio_benchmark/reader/image_reader.py | 2 +- .../reader/indexed_binary_mmap_reader.py | 2 +- .../reader/indexed_binary_reader.py | 2 +- dlio_benchmark/reader/npy_reader.py | 2 +- dlio_benchmark/reader/npz_reader.py | 2 +- dlio_benchmark/reader/reader_handler.py | 2 +- dlio_benchmark/reader/synthetic_reader.py | 2 +- dlio_benchmark/reader/tf_reader.py | 2 +- dlio_benchmark/storage/file_storage.py | 2 +- dlio_benchmark/storage/s3_storage.py | 2 +- dlio_benchmark/utils/config.py | 8 ++++++-- dlio_benchmark/utils/statscounter.py | 4 ++-- dlio_benchmark/utils/utility.py | 19 +++++++++++++++++++ requirements.txt | 2 +- 39 files changed, 84 insertions(+), 37 deletions(-) diff --git a/dlio_benchmark/checkpointing/pytorch_checkpointing.py b/dlio_benchmark/checkpointing/pytorch_checkpointing.py index 6c52e733..ba8436c3 100644 --- a/dlio_benchmark/checkpointing/pytorch_checkpointing.py +++ b/dlio_benchmark/checkpointing/pytorch_checkpointing.py @@ -18,7 +18,7 @@ import torch from dlio_benchmark.checkpointing.base_checkpointing import BaseCheckpointing -from dlio_profiler.logger import fn_interceptor as Profile +from dlio_benchmark.utils.utility import Profile from dlio_benchmark.common.constants import MODULE_CHECKPOINT from dlio_benchmark.common.enumerations import CheckpointLocationType diff --git a/dlio_benchmark/checkpointing/tf_checkpointing.py b/dlio_benchmark/checkpointing/tf_checkpointing.py index 9cbc44c6..211f0376 100644 --- a/dlio_benchmark/checkpointing/tf_checkpointing.py +++ b/dlio_benchmark/checkpointing/tf_checkpointing.py @@ -17,7 +17,7 @@ import os from dlio_benchmark.checkpointing.base_checkpointing import BaseCheckpointing -from dlio_profiler.logger import fn_interceptor as Profile +from dlio_benchmark.utils.utility import Profile import tensorflow as tf from dlio_benchmark.common.constants import MODULE_CHECKPOINT diff --git a/dlio_benchmark/configs/workload/cosmoflow_a100.yaml b/dlio_benchmark/configs/workload/cosmoflow_a100.yaml index b7247890..28ea5987 100644 --- a/dlio_benchmark/configs/workload/cosmoflow_a100.yaml +++ b/dlio_benchmark/configs/workload/cosmoflow_a100.yaml @@ -18,7 +18,13 @@ reader: data_loader: tensorflow read_threads: 4 batch_size: 1 + file_shuffle: seed + sample_shuffle: seed + shuffle_size: 2 train: epochs: 5 computation_time: 0.00551 + +metric: + au: 0.70 \ No newline at end of file diff --git a/dlio_benchmark/configs/workload/cosmoflow_h100.yaml b/dlio_benchmark/configs/workload/cosmoflow_h100.yaml index c26a2a68..79f97772 100644 --- a/dlio_benchmark/configs/workload/cosmoflow_h100.yaml +++ b/dlio_benchmark/configs/workload/cosmoflow_h100.yaml @@ -18,7 +18,13 @@ reader: data_loader: tensorflow read_threads: 4 batch_size: 1 - + file_shuffle: seed + sample_shuffle: seed + shuffle_size: 2 + train: epochs: 5 computation_time: 0.00350 + +metric: + au: 0.70 \ No newline at end of file diff --git a/dlio_benchmark/configs/workload/resnet50_a100.yaml b/dlio_benchmark/configs/workload/resnet50_a100.yaml index 5dc66572..acfb2b1f 100644 --- a/dlio_benchmark/configs/workload/resnet50_a100.yaml +++ b/dlio_benchmark/configs/workload/resnet50_a100.yaml @@ -23,4 +23,7 @@ reader: read_threads: 8 computation_threads: 8 batch_size: 400 - dont_use_mmap: True \ No newline at end of file + dont_use_mmap: True + +metric: + au: 0.90 \ No newline at end of file diff --git a/dlio_benchmark/configs/workload/resnet50_h100.yaml b/dlio_benchmark/configs/workload/resnet50_h100.yaml index 92095f9f..ef009f1c 100644 --- a/dlio_benchmark/configs/workload/resnet50_h100.yaml +++ b/dlio_benchmark/configs/workload/resnet50_h100.yaml @@ -23,4 +23,7 @@ reader: read_threads: 8 computation_threads: 8 batch_size: 400 - dont_use_mmap: True \ No newline at end of file + + +metric: + au: 0.90 \ No newline at end of file diff --git a/dlio_benchmark/configs/workload/unet3d_a100.yaml b/dlio_benchmark/configs/workload/unet3d_a100.yaml index 97b73c99..e26ce454 100644 --- a/dlio_benchmark/configs/workload/unet3d_a100.yaml +++ b/dlio_benchmark/configs/workload/unet3d_a100.yaml @@ -32,3 +32,6 @@ checkpoint: checkpoint_after_epoch: 5 epochs_between_checkpoints: 2 model_size: 499153191 + +metric: + au: 0.90 \ No newline at end of file diff --git a/dlio_benchmark/configs/workload/unet3d_h100.yaml b/dlio_benchmark/configs/workload/unet3d_h100.yaml index 389f8d62..b9e4398f 100644 --- a/dlio_benchmark/configs/workload/unet3d_h100.yaml +++ b/dlio_benchmark/configs/workload/unet3d_h100.yaml @@ -32,3 +32,6 @@ checkpoint: checkpoint_after_epoch: 5 epochs_between_checkpoints: 2 model_size: 499153191 + +metric: + au: 0.90 \ No newline at end of file diff --git a/dlio_benchmark/data_generator/hdf5_generator.py b/dlio_benchmark/data_generator/hdf5_generator.py index ace59fae..81911440 100644 --- a/dlio_benchmark/data_generator/hdf5_generator.py +++ b/dlio_benchmark/data_generator/hdf5_generator.py @@ -21,7 +21,7 @@ from dlio_benchmark.common.enumerations import Compression from dlio_benchmark.data_generator.data_generator import DataGenerator from dlio_benchmark.utils.utility import progress -from dlio_profiler.logger import fn_interceptor as Profile +from dlio_benchmark.utils.utility import Profile from shutil import copyfile from dlio_benchmark.common.constants import MODULE_DATA_GENERATOR diff --git a/dlio_benchmark/data_generator/indexed_binary_generator.py b/dlio_benchmark/data_generator/indexed_binary_generator.py index 7f16032e..6a7013b9 100644 --- a/dlio_benchmark/data_generator/indexed_binary_generator.py +++ b/dlio_benchmark/data_generator/indexed_binary_generator.py @@ -22,7 +22,7 @@ import numpy as np from dlio_benchmark.utils.utility import progress, utcnow, DLIOMPI -from dlio_profiler.logger import fn_interceptor as Profile +from dlio_benchmark.utils.utility import Profile from shutil import copyfile from dlio_benchmark.common.constants import MODULE_DATA_GENERATOR import struct diff --git a/dlio_benchmark/data_generator/jpeg_generator.py b/dlio_benchmark/data_generator/jpeg_generator.py index fdf21e4c..3be0d360 100644 --- a/dlio_benchmark/data_generator/jpeg_generator.py +++ b/dlio_benchmark/data_generator/jpeg_generator.py @@ -22,7 +22,7 @@ import numpy as np from dlio_benchmark.utils.utility import progress, utcnow -from dlio_profiler.logger import fn_interceptor as Profile +from dlio_benchmark.utils.utility import Profile from shutil import copyfile import PIL.Image as im from dlio_benchmark.common.constants import MODULE_DATA_GENERATOR diff --git a/dlio_benchmark/data_generator/npy_generator.py b/dlio_benchmark/data_generator/npy_generator.py index de9f27bb..8bf73033 100644 --- a/dlio_benchmark/data_generator/npy_generator.py +++ b/dlio_benchmark/data_generator/npy_generator.py @@ -22,7 +22,7 @@ import numpy as np from dlio_benchmark.utils.utility import progress, utcnow -from dlio_profiler.logger import fn_interceptor as Profile +from dlio_benchmark.utils.utility import Profile from shutil import copyfile from dlio_benchmark.common.constants import MODULE_DATA_GENERATOR diff --git a/dlio_benchmark/data_generator/npz_generator.py b/dlio_benchmark/data_generator/npz_generator.py index 74ecdd1e..13ee8785 100644 --- a/dlio_benchmark/data_generator/npz_generator.py +++ b/dlio_benchmark/data_generator/npz_generator.py @@ -22,7 +22,7 @@ import numpy as np from dlio_benchmark.utils.utility import progress, utcnow -from dlio_profiler.logger import fn_interceptor as Profile +from dlio_benchmark.utils.utility import Profile from shutil import copyfile from dlio_benchmark.common.constants import MODULE_DATA_GENERATOR diff --git a/dlio_benchmark/data_generator/png_generator.py b/dlio_benchmark/data_generator/png_generator.py index 48343bbf..b34d5ab0 100644 --- a/dlio_benchmark/data_generator/png_generator.py +++ b/dlio_benchmark/data_generator/png_generator.py @@ -22,7 +22,7 @@ import numpy as np from dlio_benchmark.utils.utility import progress, utcnow -from dlio_profiler.logger import fn_interceptor as Profile +from dlio_benchmark.utils.utility import Profile from shutil import copyfile import PIL.Image as im from dlio_benchmark.common.constants import MODULE_DATA_GENERATOR diff --git a/dlio_benchmark/data_generator/synthetic_generator.py b/dlio_benchmark/data_generator/synthetic_generator.py index 5135d7f9..d8a54118 100644 --- a/dlio_benchmark/data_generator/synthetic_generator.py +++ b/dlio_benchmark/data_generator/synthetic_generator.py @@ -22,7 +22,7 @@ import numpy as np from dlio_benchmark.utils.utility import progress, utcnow -from dlio_profiler.logger import fn_interceptor as Profile +from dlio_benchmark.utils.utility import Profile from shutil import copyfile import PIL.Image as im from dlio_benchmark.common.constants import MODULE_DATA_GENERATOR diff --git a/dlio_benchmark/data_generator/tf_generator.py b/dlio_benchmark/data_generator/tf_generator.py index c4da8616..b1151db6 100644 --- a/dlio_benchmark/data_generator/tf_generator.py +++ b/dlio_benchmark/data_generator/tf_generator.py @@ -20,7 +20,7 @@ from dlio_benchmark.data_generator.data_generator import DataGenerator import numpy as np import tensorflow as tf -from dlio_profiler.logger import fn_interceptor as Profile +from dlio_benchmark.utils.utility import Profile from dlio_benchmark.utils.utility import progress, utcnow from shutil import copyfile diff --git a/dlio_benchmark/data_loader/dali_data_loader.py b/dlio_benchmark/data_loader/dali_data_loader.py index d7deb921..2dc12944 100644 --- a/dlio_benchmark/data_loader/dali_data_loader.py +++ b/dlio_benchmark/data_loader/dali_data_loader.py @@ -27,7 +27,7 @@ from dlio_benchmark.data_loader.base_data_loader import BaseDataLoader from dlio_benchmark.reader.reader_factory import ReaderFactory from dlio_benchmark.utils.utility import utcnow -from dlio_profiler.logger import fn_interceptor as Profile +from dlio_benchmark.utils.utility import Profile import os dlp = Profile(MODULE_DATA_LOADER) diff --git a/dlio_benchmark/data_loader/synthetic_data_loader.py b/dlio_benchmark/data_loader/synthetic_data_loader.py index ce54b54e..32f0985d 100644 --- a/dlio_benchmark/data_loader/synthetic_data_loader.py +++ b/dlio_benchmark/data_loader/synthetic_data_loader.py @@ -27,7 +27,7 @@ from dlio_benchmark.data_loader.base_data_loader import BaseDataLoader from dlio_benchmark.reader.reader_factory import ReaderFactory from dlio_benchmark.utils.utility import utcnow -from dlio_profiler.logger import fn_interceptor as Profile +from dlio_benchmark.utils.utility import Profile import os dlp = Profile(MODULE_DATA_LOADER) diff --git a/dlio_benchmark/data_loader/tf_data_loader.py b/dlio_benchmark/data_loader/tf_data_loader.py index edf01d30..162e6c1f 100644 --- a/dlio_benchmark/data_loader/tf_data_loader.py +++ b/dlio_benchmark/data_loader/tf_data_loader.py @@ -25,7 +25,7 @@ from dlio_benchmark.data_loader.base_data_loader import BaseDataLoader from dlio_benchmark.reader.reader_factory import ReaderFactory from dlio_benchmark.utils.utility import utcnow -from dlio_profiler.logger import fn_interceptor as Profile +from dlio_benchmark.utils.utility import Profile import numpy as np diff --git a/dlio_benchmark/data_loader/torch_data_loader.py b/dlio_benchmark/data_loader/torch_data_loader.py index e2ae4f95..e72559ef 100644 --- a/dlio_benchmark/data_loader/torch_data_loader.py +++ b/dlio_benchmark/data_loader/torch_data_loader.py @@ -29,7 +29,7 @@ from dlio_benchmark.reader.reader_factory import ReaderFactory from dlio_benchmark.utils.utility import utcnow, DLIOMPI from dlio_benchmark.utils.config import ConfigArguments -from dlio_profiler.logger import fn_interceptor as Profile +from dlio_benchmark.utils.utility import Profile dlp = Profile(MODULE_DATA_LOADER) diff --git a/dlio_benchmark/framework/tf_framework.py b/dlio_benchmark/framework/tf_framework.py index 2e21e151..1b1a049f 100644 --- a/dlio_benchmark/framework/tf_framework.py +++ b/dlio_benchmark/framework/tf_framework.py @@ -22,7 +22,7 @@ from dlio_benchmark.common.constants import MODULE_AI_FRAMEWORK from dlio_benchmark.data_loader.data_loader_factory import DataLoaderFactory from dlio_benchmark.utils.utility import utcnow, DLIOMPI -from dlio_profiler.logger import fn_interceptor as Profile +from dlio_benchmark.utils.utility import Profile from dlio_benchmark.common.error_code import ErrorCodes from dlio_benchmark.framework.framework import Framework from dlio_benchmark.reader.reader_factory import ReaderFactory diff --git a/dlio_benchmark/framework/torch_framework.py b/dlio_benchmark/framework/torch_framework.py index 8660914c..1d4b85c0 100644 --- a/dlio_benchmark/framework/torch_framework.py +++ b/dlio_benchmark/framework/torch_framework.py @@ -25,7 +25,7 @@ import functools import logging from dlio_benchmark.utils.utility import utcnow, DLIOMPI -from dlio_profiler.logger import fn_interceptor as Profile +from dlio_benchmark.utils.utility import Profile from time import sleep, time diff --git a/dlio_benchmark/main.py b/dlio_benchmark/main.py index ebeb5a9f..8669b41c 100644 --- a/dlio_benchmark/main.py +++ b/dlio_benchmark/main.py @@ -46,7 +46,7 @@ from dlio_benchmark.framework.framework_factory import FrameworkFactory from dlio_benchmark.data_generator.generator_factory import GeneratorFactory from dlio_benchmark.storage.storage_factory import StorageFactory -from dlio_profiler.logger import dlio_logger as PerfTrace, fn_interceptor as Profile +from dlio_benchmark.utils.utility import Profile, PerfTrace dlp = Profile(MODULE_DLIO_BENCHMARK) diff --git a/dlio_benchmark/reader/csv_reader.py b/dlio_benchmark/reader/csv_reader.py index 75ca7577..797ce886 100644 --- a/dlio_benchmark/reader/csv_reader.py +++ b/dlio_benchmark/reader/csv_reader.py @@ -17,7 +17,7 @@ import pandas as pd from dlio_benchmark.common.constants import MODULE_DATA_READER -from dlio_profiler.logger import fn_interceptor as Profile +from dlio_benchmark.utils.utility import Profile from dlio_benchmark.reader.reader_handler import FormatReader dlp = Profile(MODULE_DATA_READER) diff --git a/dlio_benchmark/reader/hdf5_reader.py b/dlio_benchmark/reader/hdf5_reader.py index c2ffe7ec..6a1fc497 100644 --- a/dlio_benchmark/reader/hdf5_reader.py +++ b/dlio_benchmark/reader/hdf5_reader.py @@ -19,7 +19,7 @@ import h5py from dlio_benchmark.common.constants import MODULE_DATA_READER -from dlio_profiler.logger import fn_interceptor as Profile +from dlio_benchmark.utils.utility import Profile from dlio_benchmark.reader.reader_handler import FormatReader dlp = Profile(MODULE_DATA_READER) diff --git a/dlio_benchmark/reader/image_reader.py b/dlio_benchmark/reader/image_reader.py index 6466ad5f..6b3ef487 100644 --- a/dlio_benchmark/reader/image_reader.py +++ b/dlio_benchmark/reader/image_reader.py @@ -22,7 +22,7 @@ from dlio_benchmark.common.constants import MODULE_DATA_READER from dlio_benchmark.reader.reader_handler import FormatReader from dlio_benchmark.utils.utility import utcnow -from dlio_profiler.logger import fn_interceptor as Profile +from dlio_benchmark.utils.utility import Profile dlp = Profile(MODULE_DATA_READER) diff --git a/dlio_benchmark/reader/indexed_binary_mmap_reader.py b/dlio_benchmark/reader/indexed_binary_mmap_reader.py index 7dce843c..bb86e4d3 100644 --- a/dlio_benchmark/reader/indexed_binary_mmap_reader.py +++ b/dlio_benchmark/reader/indexed_binary_mmap_reader.py @@ -22,7 +22,7 @@ from dlio_benchmark.common.constants import MODULE_DATA_READER from dlio_benchmark.common.enumerations import DataLoaderSampler from dlio_benchmark.reader.reader_handler import FormatReader -from dlio_profiler.logger import fn_interceptor as Profile +from dlio_benchmark.utils.utility import Profile dlp = Profile(MODULE_DATA_READER) diff --git a/dlio_benchmark/reader/indexed_binary_reader.py b/dlio_benchmark/reader/indexed_binary_reader.py index 905ec337..0eb50f03 100644 --- a/dlio_benchmark/reader/indexed_binary_reader.py +++ b/dlio_benchmark/reader/indexed_binary_reader.py @@ -22,7 +22,7 @@ from dlio_benchmark.common.constants import MODULE_DATA_READER from dlio_benchmark.common.enumerations import DataLoaderSampler from dlio_benchmark.reader.reader_handler import FormatReader -from dlio_profiler.logger import fn_interceptor as Profile +from dlio_benchmark.utils.utility import Profile dlp = Profile(MODULE_DATA_READER) diff --git a/dlio_benchmark/reader/npy_reader.py b/dlio_benchmark/reader/npy_reader.py index 73955a4d..bf4f57f5 100644 --- a/dlio_benchmark/reader/npy_reader.py +++ b/dlio_benchmark/reader/npy_reader.py @@ -18,7 +18,7 @@ from dlio_benchmark.common.constants import MODULE_DATA_READER from dlio_benchmark.reader.reader_handler import FormatReader -from dlio_profiler.logger import fn_interceptor as Profile +from dlio_benchmark.utils.utility import Profile dlp = Profile(MODULE_DATA_READER) diff --git a/dlio_benchmark/reader/npz_reader.py b/dlio_benchmark/reader/npz_reader.py index 685a1815..350d4baf 100644 --- a/dlio_benchmark/reader/npz_reader.py +++ b/dlio_benchmark/reader/npz_reader.py @@ -18,7 +18,7 @@ from dlio_benchmark.common.constants import MODULE_DATA_READER from dlio_benchmark.reader.reader_handler import FormatReader -from dlio_profiler.logger import fn_interceptor as Profile +from dlio_benchmark.utils.utility import Profile dlp = Profile(MODULE_DATA_READER) diff --git a/dlio_benchmark/reader/reader_handler.py b/dlio_benchmark/reader/reader_handler.py index ff459cf2..fd6c295d 100644 --- a/dlio_benchmark/reader/reader_handler.py +++ b/dlio_benchmark/reader/reader_handler.py @@ -21,7 +21,7 @@ from dlio_benchmark.framework.framework_factory import FrameworkFactory from dlio_benchmark.storage.storage_factory import StorageFactory from dlio_benchmark.utils.utility import utcnow -from dlio_profiler.logger import fn_interceptor as Profile +from dlio_benchmark.utils.utility import Profile from dlio_benchmark.utils.config import ConfigArguments import numpy as np import os diff --git a/dlio_benchmark/reader/synthetic_reader.py b/dlio_benchmark/reader/synthetic_reader.py index 5114a0ce..b2690dc5 100644 --- a/dlio_benchmark/reader/synthetic_reader.py +++ b/dlio_benchmark/reader/synthetic_reader.py @@ -18,7 +18,7 @@ from dlio_benchmark.common.constants import MODULE_DATA_READER from dlio_benchmark.reader.reader_handler import FormatReader -from dlio_profiler.logger import fn_interceptor as Profile +from dlio_benchmark.utils.utility import Profile dlp = Profile(MODULE_DATA_READER) diff --git a/dlio_benchmark/reader/tf_reader.py b/dlio_benchmark/reader/tf_reader.py index bc98d523..0909cc8e 100644 --- a/dlio_benchmark/reader/tf_reader.py +++ b/dlio_benchmark/reader/tf_reader.py @@ -20,7 +20,7 @@ from dlio_benchmark.common.constants import MODULE_DATA_READER from dlio_benchmark.utils.utility import utcnow -from dlio_profiler.logger import fn_interceptor as Profile +from dlio_benchmark.utils.utility import Profile from dlio_benchmark.common.enumerations import DatasetType, Shuffle from dlio_benchmark.reader.reader_handler import FormatReader import tensorflow as tf diff --git a/dlio_benchmark/storage/file_storage.py b/dlio_benchmark/storage/file_storage.py index 26ae8a0b..b7b122a2 100644 --- a/dlio_benchmark/storage/file_storage.py +++ b/dlio_benchmark/storage/file_storage.py @@ -24,7 +24,7 @@ import glob import shutil -from dlio_profiler.logger import fn_interceptor as Profile +from dlio_benchmark.utils.utility import Profile dlp = Profile(MODULE_STORAGE) diff --git a/dlio_benchmark/storage/s3_storage.py b/dlio_benchmark/storage/s3_storage.py index 8fbaeeda..c3e0637e 100644 --- a/dlio_benchmark/storage/s3_storage.py +++ b/dlio_benchmark/storage/s3_storage.py @@ -21,7 +21,7 @@ from dlio_benchmark.common.enumerations import NamespaceType, MetadataType import os -from dlio_profiler.logger import fn_interceptor as Profile +from dlio_benchmark.utils.utility import Profile dlp = Profile(MODULE_STORAGE) diff --git a/dlio_benchmark/utils/config.py b/dlio_benchmark/utils/config.py index 088ce149..73042155 100644 --- a/dlio_benchmark/utils/config.py +++ b/dlio_benchmark/utils/config.py @@ -33,8 +33,7 @@ import math import os import numpy as np - -from dlio_profiler.logger import dlio_logger as PerfTrace, fn_interceptor as Profile, DLIO_PROFILER_ENABLE +from dlio_benchmark.utils.utility import Profile, PerfTrace, DLIO_PROFILER_ENABLE dlp = Profile(MODULE_CONFIG) @dataclass @@ -133,6 +132,7 @@ class ConfigArguments: training_steps: int = 0 eval_steps: int = 0 samples_per_thread: int = 1 + au: float = 0.90 file_map = None global_index_map = None data_loader_class = None @@ -574,3 +574,7 @@ def LoadConfig(args, config): args.iostat_devices = config['profiling']['iostat_devices'] if isinstance(args.iostat_devices, str): args.iostat_devices = [args.iostat_devices] + + if 'metric' in config: + if 'au' in config['metric']: + args.au = config['metric']['au'] diff --git a/dlio_benchmark/utils/statscounter.py b/dlio_benchmark/utils/statscounter.py index 8ba652e8..5d2c8823 100644 --- a/dlio_benchmark/utils/statscounter.py +++ b/dlio_benchmark/utils/statscounter.py @@ -142,7 +142,7 @@ def end_run(self): self.summary['epochs'] = len(train_au) self.summary['metric']['train_au_percentage'] = list(train_au) self.summary['metric']['train_au_mean_percentage'] = np.mean(train_au) - if self.summary['metric']['train_au_mean_percentage'] >=90: + if self.summary['metric']['train_au_mean_percentage'] >=self.args.au*100: self.summary['metric']['train_au_meet_expectation'] = 'success' else: self.summary['metric']['train_au_meet_expectation'] = 'fail' @@ -157,7 +157,7 @@ def end_run(self): eval_throughput = self.comm.allreduce(self.eval_throughput) self.summary['metric']['eval_au_percentage'] = list(eval_au) self.summary['metric']['eval_au_mean_percentage'] = np.mean(eval_au) - if self.summary['metric']['eval_au_mean_percentage'] >=90: + if self.summary['metric']['eval_au_mean_percentage'] >=self.args.au*100: self.summary['metric']['eval_au_meet_expectation'] = 'success' else: self.summary['metric']['eval_au_meet_expectation'] = 'fail' diff --git a/dlio_benchmark/utils/utility.py b/dlio_benchmark/utils/utility.py index f760255f..0396595b 100644 --- a/dlio_benchmark/utils/utility.py +++ b/dlio_benchmark/utils/utility.py @@ -33,6 +33,25 @@ import importlib.util # UTC timestamp format with microsecond precision from dlio_benchmark.common.enumerations import LoggerType, MPIState +try: + from dlio_profiler.logger import dlio_logger as PerfTrace, fn_interceptor as Profile, DLIO_PROFILER_ENABLE +except: + class Profile: + def __init__(self, name=None, cat=None): + self.type = type + def log(self, func): + return func + def iter(self, a): + return a + class dlio_logger: + def __init__(self,): + self.type = None + def initialize_log(self, logfile=None, data_dir=None, process_id=-1): + return + def iter(self, a): + return a + PerfTrace = dlio_logger() + DLIO_PROFILER_ENABLE = False LOG_TS_FORMAT = "%Y-%m-%dT%H:%M:%S.%f" diff --git a/requirements.txt b/requirements.txt index 26c8828c..388b923f 100644 --- a/requirements.txt +++ b/requirements.txt @@ -4,7 +4,7 @@ astunparse>=1.6.3 cachetools>=5.2.0 certifi>=2022.9.24 charset-normalizer>=2.1.1 -dlio_profiler_py==0.0.3 +#dlio_profiler_py==0.0.5 flatbuffers>=23.5.26 gast>=0.4.0 google-auth>=2.14.1 From 581a4172bd26f676982109b07c808ae34a628f5f Mon Sep 17 00:00:00 2001 From: Huihuo Zheng Date: Fri, 7 Jun 2024 08:18:54 -0500 Subject: [PATCH 12/19] recovered back dlio_profiler --- requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/requirements.txt b/requirements.txt index 388b923f..6def223f 100644 --- a/requirements.txt +++ b/requirements.txt @@ -4,7 +4,7 @@ astunparse>=1.6.3 cachetools>=5.2.0 certifi>=2022.9.24 charset-normalizer>=2.1.1 -#dlio_profiler_py==0.0.5 +dlio_profiler_py==0.0.5 flatbuffers>=23.5.26 gast>=0.4.0 google-auth>=2.14.1 From f07d392eb00667976e77b5255e0037c63ded2082 Mon Sep 17 00:00:00 2001 From: Huihuo Zheng Date: Fri, 7 Jun 2024 08:27:17 -0500 Subject: [PATCH 13/19] fixed potential not enough samples --- dlio_benchmark/reader/tf_reader.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/dlio_benchmark/reader/tf_reader.py b/dlio_benchmark/reader/tf_reader.py index 0909cc8e..cb6f976f 100644 --- a/dlio_benchmark/reader/tf_reader.py +++ b/dlio_benchmark/reader/tf_reader.py @@ -106,8 +106,8 @@ def next(self): lambda x: tf.py_function(func=self._parse_image, inp=[x], Tout=[tf.uint8]), num_parallel_calls=self._args.computation_threads) - self._dataset = self._dataset.repeat(self._args.epochs) - total = math.ceil(len(self._file_list)/self._args.comm_size / self.batch_size * self._args.num_samples_per_file) + self._dataset = self._dataset.repeat() + total = math.float(len(self._file_list)/self._args.comm_size / self.batch_size * self._args.num_samples_per_file) return self._dataset.take(total*self._args.epochs).prefetch(buffer_size=self._args.prefetch_size) @dlp.log def read_index(self, image_idx, step): From 71e2cfa09bb866ca619a6792cf4f798a4205eea5 Mon Sep 17 00:00:00 2001 From: Huihuo Zheng Date: Fri, 7 Jun 2024 11:33:08 -0500 Subject: [PATCH 14/19] Fixed potential insufficient samples due to num_files is not divisible by comm.size (#200) * added au metric to the configuration file; set shuffling and shuffle buffer size to be 2 for cosmoflow * removed dependencies on dlioprofiler * fixed bugs * recovered back dlio_profiler * fixed potential not enough samples * Update tf_reader.py --- dlio_benchmark/reader/tf_reader.py | 4 ++-- requirements.txt | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/dlio_benchmark/reader/tf_reader.py b/dlio_benchmark/reader/tf_reader.py index 0909cc8e..ce37b925 100644 --- a/dlio_benchmark/reader/tf_reader.py +++ b/dlio_benchmark/reader/tf_reader.py @@ -106,8 +106,8 @@ def next(self): lambda x: tf.py_function(func=self._parse_image, inp=[x], Tout=[tf.uint8]), num_parallel_calls=self._args.computation_threads) - self._dataset = self._dataset.repeat(self._args.epochs) - total = math.ceil(len(self._file_list)/self._args.comm_size / self.batch_size * self._args.num_samples_per_file) + self._dataset = self._dataset.repeat() + total = math.floor(len(self._file_list)/self._args.comm_size / self.batch_size * self._args.num_samples_per_file) return self._dataset.take(total*self._args.epochs).prefetch(buffer_size=self._args.prefetch_size) @dlp.log def read_index(self, image_idx, step): diff --git a/requirements.txt b/requirements.txt index 388b923f..6def223f 100644 --- a/requirements.txt +++ b/requirements.txt @@ -4,7 +4,7 @@ astunparse>=1.6.3 cachetools>=5.2.0 certifi>=2022.9.24 charset-normalizer>=2.1.1 -#dlio_profiler_py==0.0.5 +dlio_profiler_py==0.0.5 flatbuffers>=23.5.26 gast>=0.4.0 google-auth>=2.14.1 From 67c5d3195dc6e9a08ed52363a15df810c291387a Mon Sep 17 00:00:00 2001 From: Huihuo Zheng Date: Fri, 7 Jun 2024 11:44:16 -0500 Subject: [PATCH 15/19] Mlperf requests (#201) * added au metric to the configuration file; set shuffling and shuffle buffer size to be 2 for cosmoflow * removed dependencies on dlioprofiler * fixed bugs * fixed issue with dlio_profiler * bring back dlio_profiler_py --- dlio_benchmark/utils/utility.py | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/dlio_benchmark/utils/utility.py b/dlio_benchmark/utils/utility.py index 0396595b..ed5fbd5e 100644 --- a/dlio_benchmark/utils/utility.py +++ b/dlio_benchmark/utils/utility.py @@ -36,14 +36,22 @@ try: from dlio_profiler.logger import dlio_logger as PerfTrace, fn_interceptor as Profile, DLIO_PROFILER_ENABLE except: - class Profile: + class Profile(object): def __init__(self, name=None, cat=None): self.type = type def log(self, func): return func + def log_init(self, func): + return func def iter(self, a): return a - class dlio_logger: + def __enter__(self): + return + def __exit__(self, type, value, traceback): + return + def update(self, *, epoch=0, step=0, size=0, default=None): + return + class dlio_logger(object): def __init__(self,): self.type = None def initialize_log(self, logfile=None, data_dir=None, process_id=-1): From 27e63f338757bc4d9a2ad9499bde7e169d4ac2f3 Mon Sep 17 00:00:00 2001 From: Huihuo Zheng Date: Fri, 7 Jun 2024 11:46:41 -0500 Subject: [PATCH 16/19] Update tf_reader.py fixed typo --- dlio_benchmark/reader/tf_reader.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dlio_benchmark/reader/tf_reader.py b/dlio_benchmark/reader/tf_reader.py index cb6f976f..ce37b925 100644 --- a/dlio_benchmark/reader/tf_reader.py +++ b/dlio_benchmark/reader/tf_reader.py @@ -107,7 +107,7 @@ def next(self): num_parallel_calls=self._args.computation_threads) self._dataset = self._dataset.repeat() - total = math.float(len(self._file_list)/self._args.comm_size / self.batch_size * self._args.num_samples_per_file) + total = math.floor(len(self._file_list)/self._args.comm_size / self.batch_size * self._args.num_samples_per_file) return self._dataset.take(total*self._args.epochs).prefetch(buffer_size=self._args.prefetch_size) @dlp.log def read_index(self, image_idx, step): From cef8f9c5a54240f63e86d56b615596f56572ce58 Mon Sep 17 00:00:00 2001 From: Huihuo Zheng Date: Fri, 7 Jun 2024 22:19:11 -0500 Subject: [PATCH 17/19] Bring v1.0 to the most recent commit (#202) (#203) * Request changes from MLPerf Storage (#199) * added au metric to the configuration file; set shuffling and shuffle buffer size to be 2 for cosmoflow * removed dependencies on dlioprofiler * fixed bugs * Fixed potential insufficient samples due to num_files is not divisible by comm.size (#200) * added au metric to the configuration file; set shuffling and shuffle buffer size to be 2 for cosmoflow * removed dependencies on dlioprofiler * fixed bugs * recovered back dlio_profiler * fixed potential not enough samples * Update tf_reader.py * Mlperf requests (#201) * added au metric to the configuration file; set shuffling and shuffle buffer size to be 2 for cosmoflow * removed dependencies on dlioprofiler * fixed bugs * fixed issue with dlio_profiler * bring back dlio_profiler_py From 05c8b13f54fda7e6c4759b478125f9f05ca2b92f Mon Sep 17 00:00:00 2001 From: Johnu George Date: Tue, 11 Jun 2024 22:13:10 +0530 Subject: [PATCH 18/19] Fix requirements file (#204) Signed-off-by: Johnu George --- requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/requirements.txt b/requirements.txt index 6def223f..537a6552 100644 --- a/requirements.txt +++ b/requirements.txt @@ -20,7 +20,7 @@ Markdown>=3.4.1 MarkupSafe>=2.1.1 mpi4py>=3.1.4 numpy>=1.23.5 ---extra-index-url https://pypi.nvidia.com nvidia-dali-cuda110>=1.34.0 +nvidia-dali-cuda110>=1.34.0 oauthlib>=3.2.2 omegaconf>=2.2.3 opt-einsum>=3.3.0 From 01283ab2b2e04af6ad5b8f540c066b5959de551a Mon Sep 17 00:00:00 2001 From: Huihuo Zheng Date: Tue, 11 Jun 2024 12:08:56 -0500 Subject: [PATCH 19/19] Mlperf storage v1.0 (#206) * Bring v1.0 to the most recent commit (#202) * Request changes from MLPerf Storage (#199) * added au metric to the configuration file; set shuffling and shuffle buffer size to be 2 for cosmoflow * removed dependencies on dlioprofiler * fixed bugs * Fixed potential insufficient samples due to num_files is not divisible by comm.size (#200) * added au metric to the configuration file; set shuffling and shuffle buffer size to be 2 for cosmoflow * removed dependencies on dlioprofiler * fixed bugs * recovered back dlio_profiler * fixed potential not enough samples * Update tf_reader.py * Mlperf requests (#201) * added au metric to the configuration file; set shuffling and shuffle buffer size to be 2 for cosmoflow * removed dependencies on dlioprofiler * fixed bugs * fixed issue with dlio_profiler * bring back dlio_profiler_py * sync up (#205) * Request changes from MLPerf Storage (#199) * added au metric to the configuration file; set shuffling and shuffle buffer size to be 2 for cosmoflow * removed dependencies on dlioprofiler * fixed bugs * Fixed potential insufficient samples due to num_files is not divisible by comm.size (#200) * added au metric to the configuration file; set shuffling and shuffle buffer size to be 2 for cosmoflow * removed dependencies on dlioprofiler * fixed bugs * recovered back dlio_profiler * fixed potential not enough samples * Update tf_reader.py * Mlperf requests (#201) * added au metric to the configuration file; set shuffling and shuffle buffer size to be 2 for cosmoflow * removed dependencies on dlioprofiler * fixed bugs * fixed issue with dlio_profiler * bring back dlio_profiler_py * Bring v1.0 to the most recent commit (#202) (#203) * Request changes from MLPerf Storage (#199) * added au metric to the configuration file; set shuffling and shuffle buffer size to be 2 for cosmoflow * removed dependencies on dlioprofiler * fixed bugs * Fixed potential insufficient samples due to num_files is not divisible by comm.size (#200) * added au metric to the configuration file; set shuffling and shuffle buffer size to be 2 for cosmoflow * removed dependencies on dlioprofiler * fixed bugs * recovered back dlio_profiler * fixed potential not enough samples * Update tf_reader.py * Mlperf requests (#201) * added au metric to the configuration file; set shuffling and shuffle buffer size to be 2 for cosmoflow * removed dependencies on dlioprofiler * fixed bugs * fixed issue with dlio_profiler * bring back dlio_profiler_py * Fix requirements file (#204) Signed-off-by: Johnu George --------- Signed-off-by: Johnu George Co-authored-by: Johnu George * barrier in the beginning --------- Signed-off-by: Johnu George Co-authored-by: Johnu George --- dlio_benchmark/main.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/dlio_benchmark/main.py b/dlio_benchmark/main.py index 8669b41c..0d56bb20 100644 --- a/dlio_benchmark/main.py +++ b/dlio_benchmark/main.py @@ -16,7 +16,7 @@ """ import os import math -import hydra + import logging from time import time, sleep import json @@ -49,7 +49,10 @@ from dlio_benchmark.utils.utility import Profile, PerfTrace dlp = Profile(MODULE_DLIO_BENCHMARK) - +from mpi4py import MPI +# To make sure the output folder is the same in all the nodes. We have to do this. +MPI.COMM_WORLD.Barrier() +import hydra class DLIOBenchmark(object): """