diff --git a/dlio_benchmark/checkpointing/pytorch_checkpointing.py b/dlio_benchmark/checkpointing/pytorch_checkpointing.py index 6c52e733..ba8436c3 100644 --- a/dlio_benchmark/checkpointing/pytorch_checkpointing.py +++ b/dlio_benchmark/checkpointing/pytorch_checkpointing.py @@ -18,7 +18,7 @@ import torch from dlio_benchmark.checkpointing.base_checkpointing import BaseCheckpointing -from dlio_profiler.logger import fn_interceptor as Profile +from dlio_benchmark.utils.utility import Profile from dlio_benchmark.common.constants import MODULE_CHECKPOINT from dlio_benchmark.common.enumerations import CheckpointLocationType diff --git a/dlio_benchmark/checkpointing/tf_checkpointing.py b/dlio_benchmark/checkpointing/tf_checkpointing.py index 9cbc44c6..211f0376 100644 --- a/dlio_benchmark/checkpointing/tf_checkpointing.py +++ b/dlio_benchmark/checkpointing/tf_checkpointing.py @@ -17,7 +17,7 @@ import os from dlio_benchmark.checkpointing.base_checkpointing import BaseCheckpointing -from dlio_profiler.logger import fn_interceptor as Profile +from dlio_benchmark.utils.utility import Profile import tensorflow as tf from dlio_benchmark.common.constants import MODULE_CHECKPOINT diff --git a/dlio_benchmark/configs/workload/cosmoflow_a100.yaml b/dlio_benchmark/configs/workload/cosmoflow_a100.yaml index b7247890..28ea5987 100644 --- a/dlio_benchmark/configs/workload/cosmoflow_a100.yaml +++ b/dlio_benchmark/configs/workload/cosmoflow_a100.yaml @@ -18,7 +18,13 @@ reader: data_loader: tensorflow read_threads: 4 batch_size: 1 + file_shuffle: seed + sample_shuffle: seed + shuffle_size: 2 train: epochs: 5 computation_time: 0.00551 + +metric: + au: 0.70 \ No newline at end of file diff --git a/dlio_benchmark/configs/workload/cosmoflow_h100.yaml b/dlio_benchmark/configs/workload/cosmoflow_h100.yaml index c26a2a68..79f97772 100644 --- a/dlio_benchmark/configs/workload/cosmoflow_h100.yaml +++ b/dlio_benchmark/configs/workload/cosmoflow_h100.yaml @@ -18,7 +18,13 @@ reader: data_loader: tensorflow read_threads: 4 batch_size: 1 - + file_shuffle: seed + sample_shuffle: seed + shuffle_size: 2 + train: epochs: 5 computation_time: 0.00350 + +metric: + au: 0.70 \ No newline at end of file diff --git a/dlio_benchmark/configs/workload/resnet50_a100.yaml b/dlio_benchmark/configs/workload/resnet50_a100.yaml index 5dc66572..acfb2b1f 100644 --- a/dlio_benchmark/configs/workload/resnet50_a100.yaml +++ b/dlio_benchmark/configs/workload/resnet50_a100.yaml @@ -23,4 +23,7 @@ reader: read_threads: 8 computation_threads: 8 batch_size: 400 - dont_use_mmap: True \ No newline at end of file + dont_use_mmap: True + +metric: + au: 0.90 \ No newline at end of file diff --git a/dlio_benchmark/configs/workload/resnet50_h100.yaml b/dlio_benchmark/configs/workload/resnet50_h100.yaml index 92095f9f..ef009f1c 100644 --- a/dlio_benchmark/configs/workload/resnet50_h100.yaml +++ b/dlio_benchmark/configs/workload/resnet50_h100.yaml @@ -23,4 +23,7 @@ reader: read_threads: 8 computation_threads: 8 batch_size: 400 - dont_use_mmap: True \ No newline at end of file + + +metric: + au: 0.90 \ No newline at end of file diff --git a/dlio_benchmark/configs/workload/unet3d_a100.yaml b/dlio_benchmark/configs/workload/unet3d_a100.yaml index 97b73c99..e26ce454 100644 --- a/dlio_benchmark/configs/workload/unet3d_a100.yaml +++ b/dlio_benchmark/configs/workload/unet3d_a100.yaml @@ -32,3 +32,6 @@ checkpoint: checkpoint_after_epoch: 5 epochs_between_checkpoints: 2 model_size: 499153191 + +metric: + au: 0.90 \ No newline at end of file diff --git a/dlio_benchmark/configs/workload/unet3d_h100.yaml b/dlio_benchmark/configs/workload/unet3d_h100.yaml index 389f8d62..b9e4398f 100644 --- a/dlio_benchmark/configs/workload/unet3d_h100.yaml +++ b/dlio_benchmark/configs/workload/unet3d_h100.yaml @@ -32,3 +32,6 @@ checkpoint: checkpoint_after_epoch: 5 epochs_between_checkpoints: 2 model_size: 499153191 + +metric: + au: 0.90 \ No newline at end of file diff --git a/dlio_benchmark/data_generator/hdf5_generator.py b/dlio_benchmark/data_generator/hdf5_generator.py index ace59fae..81911440 100644 --- a/dlio_benchmark/data_generator/hdf5_generator.py +++ b/dlio_benchmark/data_generator/hdf5_generator.py @@ -21,7 +21,7 @@ from dlio_benchmark.common.enumerations import Compression from dlio_benchmark.data_generator.data_generator import DataGenerator from dlio_benchmark.utils.utility import progress -from dlio_profiler.logger import fn_interceptor as Profile +from dlio_benchmark.utils.utility import Profile from shutil import copyfile from dlio_benchmark.common.constants import MODULE_DATA_GENERATOR diff --git a/dlio_benchmark/data_generator/indexed_binary_generator.py b/dlio_benchmark/data_generator/indexed_binary_generator.py index 7f16032e..6a7013b9 100644 --- a/dlio_benchmark/data_generator/indexed_binary_generator.py +++ b/dlio_benchmark/data_generator/indexed_binary_generator.py @@ -22,7 +22,7 @@ import numpy as np from dlio_benchmark.utils.utility import progress, utcnow, DLIOMPI -from dlio_profiler.logger import fn_interceptor as Profile +from dlio_benchmark.utils.utility import Profile from shutil import copyfile from dlio_benchmark.common.constants import MODULE_DATA_GENERATOR import struct diff --git a/dlio_benchmark/data_generator/jpeg_generator.py b/dlio_benchmark/data_generator/jpeg_generator.py index fdf21e4c..3be0d360 100644 --- a/dlio_benchmark/data_generator/jpeg_generator.py +++ b/dlio_benchmark/data_generator/jpeg_generator.py @@ -22,7 +22,7 @@ import numpy as np from dlio_benchmark.utils.utility import progress, utcnow -from dlio_profiler.logger import fn_interceptor as Profile +from dlio_benchmark.utils.utility import Profile from shutil import copyfile import PIL.Image as im from dlio_benchmark.common.constants import MODULE_DATA_GENERATOR diff --git a/dlio_benchmark/data_generator/npy_generator.py b/dlio_benchmark/data_generator/npy_generator.py index de9f27bb..8bf73033 100644 --- a/dlio_benchmark/data_generator/npy_generator.py +++ b/dlio_benchmark/data_generator/npy_generator.py @@ -22,7 +22,7 @@ import numpy as np from dlio_benchmark.utils.utility import progress, utcnow -from dlio_profiler.logger import fn_interceptor as Profile +from dlio_benchmark.utils.utility import Profile from shutil import copyfile from dlio_benchmark.common.constants import MODULE_DATA_GENERATOR diff --git a/dlio_benchmark/data_generator/npz_generator.py b/dlio_benchmark/data_generator/npz_generator.py index 74ecdd1e..13ee8785 100644 --- a/dlio_benchmark/data_generator/npz_generator.py +++ b/dlio_benchmark/data_generator/npz_generator.py @@ -22,7 +22,7 @@ import numpy as np from dlio_benchmark.utils.utility import progress, utcnow -from dlio_profiler.logger import fn_interceptor as Profile +from dlio_benchmark.utils.utility import Profile from shutil import copyfile from dlio_benchmark.common.constants import MODULE_DATA_GENERATOR diff --git a/dlio_benchmark/data_generator/png_generator.py b/dlio_benchmark/data_generator/png_generator.py index 48343bbf..b34d5ab0 100644 --- a/dlio_benchmark/data_generator/png_generator.py +++ b/dlio_benchmark/data_generator/png_generator.py @@ -22,7 +22,7 @@ import numpy as np from dlio_benchmark.utils.utility import progress, utcnow -from dlio_profiler.logger import fn_interceptor as Profile +from dlio_benchmark.utils.utility import Profile from shutil import copyfile import PIL.Image as im from dlio_benchmark.common.constants import MODULE_DATA_GENERATOR diff --git a/dlio_benchmark/data_generator/synthetic_generator.py b/dlio_benchmark/data_generator/synthetic_generator.py index 5135d7f9..d8a54118 100644 --- a/dlio_benchmark/data_generator/synthetic_generator.py +++ b/dlio_benchmark/data_generator/synthetic_generator.py @@ -22,7 +22,7 @@ import numpy as np from dlio_benchmark.utils.utility import progress, utcnow -from dlio_profiler.logger import fn_interceptor as Profile +from dlio_benchmark.utils.utility import Profile from shutil import copyfile import PIL.Image as im from dlio_benchmark.common.constants import MODULE_DATA_GENERATOR diff --git a/dlio_benchmark/data_generator/tf_generator.py b/dlio_benchmark/data_generator/tf_generator.py index c4da8616..b1151db6 100644 --- a/dlio_benchmark/data_generator/tf_generator.py +++ b/dlio_benchmark/data_generator/tf_generator.py @@ -20,7 +20,7 @@ from dlio_benchmark.data_generator.data_generator import DataGenerator import numpy as np import tensorflow as tf -from dlio_profiler.logger import fn_interceptor as Profile +from dlio_benchmark.utils.utility import Profile from dlio_benchmark.utils.utility import progress, utcnow from shutil import copyfile diff --git a/dlio_benchmark/data_loader/dali_data_loader.py b/dlio_benchmark/data_loader/dali_data_loader.py index d7deb921..2dc12944 100644 --- a/dlio_benchmark/data_loader/dali_data_loader.py +++ b/dlio_benchmark/data_loader/dali_data_loader.py @@ -27,7 +27,7 @@ from dlio_benchmark.data_loader.base_data_loader import BaseDataLoader from dlio_benchmark.reader.reader_factory import ReaderFactory from dlio_benchmark.utils.utility import utcnow -from dlio_profiler.logger import fn_interceptor as Profile +from dlio_benchmark.utils.utility import Profile import os dlp = Profile(MODULE_DATA_LOADER) diff --git a/dlio_benchmark/data_loader/synthetic_data_loader.py b/dlio_benchmark/data_loader/synthetic_data_loader.py index ab6796cc..e38bb70e 100644 --- a/dlio_benchmark/data_loader/synthetic_data_loader.py +++ b/dlio_benchmark/data_loader/synthetic_data_loader.py @@ -27,7 +27,7 @@ from dlio_benchmark.data_loader.base_data_loader import BaseDataLoader from dlio_benchmark.reader.reader_factory import ReaderFactory from dlio_benchmark.utils.utility import utcnow -from dlio_profiler.logger import fn_interceptor as Profile +from dlio_benchmark.utils.utility import Profile import os dlp = Profile(MODULE_DATA_LOADER) diff --git a/dlio_benchmark/data_loader/tf_data_loader.py b/dlio_benchmark/data_loader/tf_data_loader.py index edf01d30..162e6c1f 100644 --- a/dlio_benchmark/data_loader/tf_data_loader.py +++ b/dlio_benchmark/data_loader/tf_data_loader.py @@ -25,7 +25,7 @@ from dlio_benchmark.data_loader.base_data_loader import BaseDataLoader from dlio_benchmark.reader.reader_factory import ReaderFactory from dlio_benchmark.utils.utility import utcnow -from dlio_profiler.logger import fn_interceptor as Profile +from dlio_benchmark.utils.utility import Profile import numpy as np diff --git a/dlio_benchmark/data_loader/torch_data_loader.py b/dlio_benchmark/data_loader/torch_data_loader.py index e2ae4f95..e72559ef 100644 --- a/dlio_benchmark/data_loader/torch_data_loader.py +++ b/dlio_benchmark/data_loader/torch_data_loader.py @@ -29,7 +29,7 @@ from dlio_benchmark.reader.reader_factory import ReaderFactory from dlio_benchmark.utils.utility import utcnow, DLIOMPI from dlio_benchmark.utils.config import ConfigArguments -from dlio_profiler.logger import fn_interceptor as Profile +from dlio_benchmark.utils.utility import Profile dlp = Profile(MODULE_DATA_LOADER) diff --git a/dlio_benchmark/framework/tf_framework.py b/dlio_benchmark/framework/tf_framework.py index f618b4fa..b910e9ef 100644 --- a/dlio_benchmark/framework/tf_framework.py +++ b/dlio_benchmark/framework/tf_framework.py @@ -21,7 +21,7 @@ from dlio_benchmark.common.constants import MODULE_AI_FRAMEWORK from dlio_benchmark.data_loader.data_loader_factory import DataLoaderFactory from dlio_benchmark.utils.utility import utcnow, DLIOMPI -from dlio_profiler.logger import fn_interceptor as Profile +from dlio_benchmark.utils.utility import Profile from dlio_benchmark.common.error_code import ErrorCodes from dlio_benchmark.framework.framework import Framework from dlio_benchmark.reader.reader_factory import ReaderFactory diff --git a/dlio_benchmark/framework/torch_framework.py b/dlio_benchmark/framework/torch_framework.py index 6022a3f1..d61daf4d 100644 --- a/dlio_benchmark/framework/torch_framework.py +++ b/dlio_benchmark/framework/torch_framework.py @@ -25,7 +25,7 @@ import functools import logging from dlio_benchmark.utils.utility import utcnow, DLIOMPI -from dlio_profiler.logger import fn_interceptor as Profile +from dlio_benchmark.utils.utility import Profile from time import sleep, time diff --git a/dlio_benchmark/main.py b/dlio_benchmark/main.py index 885b1cec..2ce31710 100644 --- a/dlio_benchmark/main.py +++ b/dlio_benchmark/main.py @@ -16,7 +16,7 @@ """ import os import math -import hydra + import logging from time import time, sleep import json @@ -46,10 +46,13 @@ from dlio_benchmark.framework.framework_factory import FrameworkFactory from dlio_benchmark.data_generator.generator_factory import GeneratorFactory from dlio_benchmark.storage.storage_factory import StorageFactory -from dlio_profiler.logger import dlio_logger as PerfTrace, fn_interceptor as Profile +from dlio_benchmark.utils.utility import Profile, PerfTrace dlp = Profile(MODULE_DLIO_BENCHMARK) - +from mpi4py import MPI +# To make sure the output folder is the same in all the nodes. We have to do this. +MPI.COMM_WORLD.Barrier() +import hydra class DLIOBenchmark(object): """ diff --git a/dlio_benchmark/reader/csv_reader.py b/dlio_benchmark/reader/csv_reader.py index 75ca7577..797ce886 100644 --- a/dlio_benchmark/reader/csv_reader.py +++ b/dlio_benchmark/reader/csv_reader.py @@ -17,7 +17,7 @@ import pandas as pd from dlio_benchmark.common.constants import MODULE_DATA_READER -from dlio_profiler.logger import fn_interceptor as Profile +from dlio_benchmark.utils.utility import Profile from dlio_benchmark.reader.reader_handler import FormatReader dlp = Profile(MODULE_DATA_READER) diff --git a/dlio_benchmark/reader/hdf5_reader.py b/dlio_benchmark/reader/hdf5_reader.py index c2ffe7ec..6a1fc497 100644 --- a/dlio_benchmark/reader/hdf5_reader.py +++ b/dlio_benchmark/reader/hdf5_reader.py @@ -19,7 +19,7 @@ import h5py from dlio_benchmark.common.constants import MODULE_DATA_READER -from dlio_profiler.logger import fn_interceptor as Profile +from dlio_benchmark.utils.utility import Profile from dlio_benchmark.reader.reader_handler import FormatReader dlp = Profile(MODULE_DATA_READER) diff --git a/dlio_benchmark/reader/image_reader.py b/dlio_benchmark/reader/image_reader.py index 6466ad5f..6b3ef487 100644 --- a/dlio_benchmark/reader/image_reader.py +++ b/dlio_benchmark/reader/image_reader.py @@ -22,7 +22,7 @@ from dlio_benchmark.common.constants import MODULE_DATA_READER from dlio_benchmark.reader.reader_handler import FormatReader from dlio_benchmark.utils.utility import utcnow -from dlio_profiler.logger import fn_interceptor as Profile +from dlio_benchmark.utils.utility import Profile dlp = Profile(MODULE_DATA_READER) diff --git a/dlio_benchmark/reader/indexed_binary_mmap_reader.py b/dlio_benchmark/reader/indexed_binary_mmap_reader.py index 7dce843c..bb86e4d3 100644 --- a/dlio_benchmark/reader/indexed_binary_mmap_reader.py +++ b/dlio_benchmark/reader/indexed_binary_mmap_reader.py @@ -22,7 +22,7 @@ from dlio_benchmark.common.constants import MODULE_DATA_READER from dlio_benchmark.common.enumerations import DataLoaderSampler from dlio_benchmark.reader.reader_handler import FormatReader -from dlio_profiler.logger import fn_interceptor as Profile +from dlio_benchmark.utils.utility import Profile dlp = Profile(MODULE_DATA_READER) diff --git a/dlio_benchmark/reader/indexed_binary_reader.py b/dlio_benchmark/reader/indexed_binary_reader.py index 905ec337..0eb50f03 100644 --- a/dlio_benchmark/reader/indexed_binary_reader.py +++ b/dlio_benchmark/reader/indexed_binary_reader.py @@ -22,7 +22,7 @@ from dlio_benchmark.common.constants import MODULE_DATA_READER from dlio_benchmark.common.enumerations import DataLoaderSampler from dlio_benchmark.reader.reader_handler import FormatReader -from dlio_profiler.logger import fn_interceptor as Profile +from dlio_benchmark.utils.utility import Profile dlp = Profile(MODULE_DATA_READER) diff --git a/dlio_benchmark/reader/npy_reader.py b/dlio_benchmark/reader/npy_reader.py index 73955a4d..bf4f57f5 100644 --- a/dlio_benchmark/reader/npy_reader.py +++ b/dlio_benchmark/reader/npy_reader.py @@ -18,7 +18,7 @@ from dlio_benchmark.common.constants import MODULE_DATA_READER from dlio_benchmark.reader.reader_handler import FormatReader -from dlio_profiler.logger import fn_interceptor as Profile +from dlio_benchmark.utils.utility import Profile dlp = Profile(MODULE_DATA_READER) diff --git a/dlio_benchmark/reader/npz_reader.py b/dlio_benchmark/reader/npz_reader.py index 685a1815..350d4baf 100644 --- a/dlio_benchmark/reader/npz_reader.py +++ b/dlio_benchmark/reader/npz_reader.py @@ -18,7 +18,7 @@ from dlio_benchmark.common.constants import MODULE_DATA_READER from dlio_benchmark.reader.reader_handler import FormatReader -from dlio_profiler.logger import fn_interceptor as Profile +from dlio_benchmark.utils.utility import Profile dlp = Profile(MODULE_DATA_READER) diff --git a/dlio_benchmark/reader/reader_handler.py b/dlio_benchmark/reader/reader_handler.py index ff459cf2..fd6c295d 100644 --- a/dlio_benchmark/reader/reader_handler.py +++ b/dlio_benchmark/reader/reader_handler.py @@ -21,7 +21,7 @@ from dlio_benchmark.framework.framework_factory import FrameworkFactory from dlio_benchmark.storage.storage_factory import StorageFactory from dlio_benchmark.utils.utility import utcnow -from dlio_profiler.logger import fn_interceptor as Profile +from dlio_benchmark.utils.utility import Profile from dlio_benchmark.utils.config import ConfigArguments import numpy as np import os diff --git a/dlio_benchmark/reader/synthetic_reader.py b/dlio_benchmark/reader/synthetic_reader.py index 5114a0ce..b2690dc5 100644 --- a/dlio_benchmark/reader/synthetic_reader.py +++ b/dlio_benchmark/reader/synthetic_reader.py @@ -18,7 +18,7 @@ from dlio_benchmark.common.constants import MODULE_DATA_READER from dlio_benchmark.reader.reader_handler import FormatReader -from dlio_profiler.logger import fn_interceptor as Profile +from dlio_benchmark.utils.utility import Profile dlp = Profile(MODULE_DATA_READER) diff --git a/dlio_benchmark/reader/tf_reader.py b/dlio_benchmark/reader/tf_reader.py index 493cde2d..f44d5c18 100644 --- a/dlio_benchmark/reader/tf_reader.py +++ b/dlio_benchmark/reader/tf_reader.py @@ -20,7 +20,7 @@ from dlio_benchmark.common.constants import MODULE_DATA_READER from dlio_benchmark.utils.utility import utcnow -from dlio_profiler.logger import fn_interceptor as Profile +from dlio_benchmark.utils.utility import Profile from dlio_benchmark.common.enumerations import DatasetType, Shuffle from dlio_benchmark.reader.reader_handler import FormatReader import tensorflow as tf @@ -81,35 +81,33 @@ def _parse_image(self, serialized): @dlp.log def next(self): - logging.debug( - f"{utcnow()} Reading {len(self._file_list)} files thread {self.thread_index} rank {self._args.my_rank}") - self._dataset = tf.data.TFRecordDataset(filenames=self._file_list, buffer_size=self._args.transfer_size, + logging.debug(f"{utcnow()} Reading {len(self._file_list)} files thread {self.thread_index} rank {self._args.my_rank}") + filenames = tf.data.Dataset.list_files(self._file_list, shuffle=True) + # sharding in the file list if we have enought files. + if (len(self._file_list) >= self._args.comm_size): + filenames = filenames.shard(num_shards=self._args.comm_size, index=self._args.my_rank) + + self._dataset = tf.data.TFRecordDataset(filenames=filenames, buffer_size=self._args.transfer_size, num_parallel_reads=self._args.read_threads) - + if self._args.sample_shuffle != Shuffle.OFF: if self._args.sample_shuffle == Shuffle.SEED: self._dataset = self._dataset.shuffle(buffer_size=self._args.shuffle_size, seed=self._args.seed) else: self._dataset = self._dataset.shuffle(buffer_size=self._args.shuffle_size) - - self._dataset = self._dataset.shard(num_shards=self._args.comm_size, index=self._args.my_rank) - if self._args.computation_threads==0: - self._dataset = self._dataset.batch(self.batch_size, drop_remainder=True) - else: - if self._args.computation_threads <= self.batch_size: - self._dataset = self._dataset.batch(self.batch_size, drop_remainder=True) - self._dataset = self._dataset.map( - lambda x: tf.py_function(func=self._parse_image, inp=[x], Tout=[tf.uint8]), - num_parallel_calls=self._args.computation_threads) - else: - self._dataset = self._dataset.batch(self._args.computation_threads) - self._dataset = self._dataset.map( - lambda x: tf.py_function(func=self._parse_image, inp=[x], Tout=[tf.uint8]), - num_parallel_calls=self._args.computation_threads) - self._dataset = self._dataset.unbatch(self.batch_size) - self._dataset = self._dataset.repeat(self._args.epochs) - total = math.ceil(len(self._file_list)/self._args.comm_size / self.batch_size * self._args.num_samples_per_file) + + # shard the dataset if it is not done already. + if (len(self._file_list) < self._args.comm_size): + self._dataset = self._dataset.shard(num_shards=self._args.comm_size, index=self._args.my_rank) + + self._dataset = self._dataset.batch(self.batch_size, drop_remainder=True) + self._dataset = self._dataset.map( + lambda x: tf.py_function(func=self._parse_image, inp=[x], Tout=[tf.uint8]), + num_parallel_calls=self._args.computation_threads) + + self._dataset = self._dataset.repeat() + total = math.floor(len(self._file_list)/self._args.comm_size / self.batch_size * self._args.num_samples_per_file) return self._dataset.take(total*self._args.epochs).prefetch(buffer_size=self._args.prefetch_size) @dlp.log diff --git a/dlio_benchmark/storage/file_storage.py b/dlio_benchmark/storage/file_storage.py index 26ae8a0b..b7b122a2 100644 --- a/dlio_benchmark/storage/file_storage.py +++ b/dlio_benchmark/storage/file_storage.py @@ -24,7 +24,7 @@ import glob import shutil -from dlio_profiler.logger import fn_interceptor as Profile +from dlio_benchmark.utils.utility import Profile dlp = Profile(MODULE_STORAGE) diff --git a/dlio_benchmark/storage/s3_storage.py b/dlio_benchmark/storage/s3_storage.py index 8fbaeeda..c3e0637e 100644 --- a/dlio_benchmark/storage/s3_storage.py +++ b/dlio_benchmark/storage/s3_storage.py @@ -21,7 +21,7 @@ from dlio_benchmark.common.enumerations import NamespaceType, MetadataType import os -from dlio_profiler.logger import fn_interceptor as Profile +from dlio_benchmark.utils.utility import Profile dlp = Profile(MODULE_STORAGE) diff --git a/dlio_benchmark/utils/config.py b/dlio_benchmark/utils/config.py index d0929a91..52bb88b8 100644 --- a/dlio_benchmark/utils/config.py +++ b/dlio_benchmark/utils/config.py @@ -33,8 +33,7 @@ import math import os import numpy as np - -from dlio_profiler.logger import dlio_logger as PerfTrace, fn_interceptor as Profile, DLIO_PROFILER_ENABLE +from dlio_benchmark.utils.utility import Profile, PerfTrace, DLIO_PROFILER_ENABLE dlp = Profile(MODULE_CONFIG) @dataclass @@ -133,6 +132,7 @@ class ConfigArguments: training_steps: int = 0 eval_steps: int = 0 samples_per_thread: int = 1 + au: float = 0.90 file_map = None global_index_map = None data_loader_class = None @@ -572,3 +572,7 @@ def LoadConfig(args, config): args.iostat_devices = config['profiling']['iostat_devices'] if isinstance(args.iostat_devices, str): args.iostat_devices = [args.iostat_devices] + + if 'metric' in config: + if 'au' in config['metric']: + args.au = config['metric']['au'] diff --git a/dlio_benchmark/utils/statscounter.py b/dlio_benchmark/utils/statscounter.py index 97ceea98..77f42ef9 100644 --- a/dlio_benchmark/utils/statscounter.py +++ b/dlio_benchmark/utils/statscounter.py @@ -142,6 +142,10 @@ def end_run(self): self.summary['epochs'] = len(train_au) self.summary['metric']['train_au_percentage'] = list(train_au) self.summary['metric']['train_au_mean_percentage'] = np.mean(train_au) + if self.summary['metric']['train_au_mean_percentage'] >=self.args.au*100: + self.summary['metric']['train_au_meet_expectation'] = 'success' + else: + self.summary['metric']['train_au_meet_expectation'] = 'fail' self.summary['metric']['train_au_stdev_percentage'] = np.std(train_au) self.summary['metric']['train_throughput_samples_per_second'] = list(train_throughput) self.summary['metric']['train_throughput_mean_samples_per_second'] = np.mean(train_throughput) @@ -153,6 +157,10 @@ def end_run(self): eval_throughput = self.comm.allreduce(self.eval_throughput) self.summary['metric']['eval_au_percentage'] = list(eval_au) self.summary['metric']['eval_au_mean_percentage'] = np.mean(eval_au) + if self.summary['metric']['eval_au_mean_percentage'] >=self.args.au*100: + self.summary['metric']['eval_au_meet_expectation'] = 'success' + else: + self.summary['metric']['eval_au_meet_expectation'] = 'fail' self.summary['metric']['eval_au_stdev_percentage'] = np.std(eval_au) self.summary['metric']['eval_throughput_samples_per_second'] = list(eval_throughput) self.summary['metric']['eval_throughput_mean_samples_per_second'] = np.mean(eval_throughput) diff --git a/dlio_benchmark/utils/utility.py b/dlio_benchmark/utils/utility.py index f760255f..ed5fbd5e 100644 --- a/dlio_benchmark/utils/utility.py +++ b/dlio_benchmark/utils/utility.py @@ -33,6 +33,33 @@ import importlib.util # UTC timestamp format with microsecond precision from dlio_benchmark.common.enumerations import LoggerType, MPIState +try: + from dlio_profiler.logger import dlio_logger as PerfTrace, fn_interceptor as Profile, DLIO_PROFILER_ENABLE +except: + class Profile(object): + def __init__(self, name=None, cat=None): + self.type = type + def log(self, func): + return func + def log_init(self, func): + return func + def iter(self, a): + return a + def __enter__(self): + return + def __exit__(self, type, value, traceback): + return + def update(self, *, epoch=0, step=0, size=0, default=None): + return + class dlio_logger(object): + def __init__(self,): + self.type = None + def initialize_log(self, logfile=None, data_dir=None, process_id=-1): + return + def iter(self, a): + return a + PerfTrace = dlio_logger() + DLIO_PROFILER_ENABLE = False LOG_TS_FORMAT = "%Y-%m-%dT%H:%M:%S.%f" diff --git a/requirements.txt b/requirements.txt index 26c8828c..537a6552 100644 --- a/requirements.txt +++ b/requirements.txt @@ -4,7 +4,7 @@ astunparse>=1.6.3 cachetools>=5.2.0 certifi>=2022.9.24 charset-normalizer>=2.1.1 -dlio_profiler_py==0.0.3 +dlio_profiler_py==0.0.5 flatbuffers>=23.5.26 gast>=0.4.0 google-auth>=2.14.1 @@ -20,7 +20,7 @@ Markdown>=3.4.1 MarkupSafe>=2.1.1 mpi4py>=3.1.4 numpy>=1.23.5 ---extra-index-url https://pypi.nvidia.com nvidia-dali-cuda110>=1.34.0 +nvidia-dali-cuda110>=1.34.0 oauthlib>=3.2.2 omegaconf>=2.2.3 opt-einsum>=3.3.0