From 00a253ddd82c916cbc8b1b7ab7c1468d20bd2ded Mon Sep 17 00:00:00 2001 From: Huihuo Zheng Date: Wed, 22 Nov 2023 04:18:22 +0000 Subject: [PATCH 01/36] fixed readthedoc build issue --- .readthedocs.yaml | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/.readthedocs.yaml b/.readthedocs.yaml index 5fc242ca..092a6b2b 100644 --- a/.readthedocs.yaml +++ b/.readthedocs.yaml @@ -30,6 +30,6 @@ sphinx: # Optional but recommended, declare the Python requirements required # to build your documentation # See https://docs.readthedocs.io/en/stable/guides/reproducible-builds.html -# python: -# install: -# - requirements: docs/requirements.txt \ No newline at end of file +python: + install: + - requirements: docs/requirements.txt \ No newline at end of file From 256cecfe1f0feeb0823484a0afede5a87fe07264 Mon Sep 17 00:00:00 2001 From: Huihuo Zheng Date: Fri, 1 Dec 2023 20:24:56 +0000 Subject: [PATCH 02/36] partial merged the following PR: https://github.com/argonne-lcf/dlio_benchmark/pull/81 --- dlio_benchmark/common/enumerations.py | 1 + .../data_loader/data_loader_factory.py | 3 + .../data_loader/native_dali_data_loader.py | 60 ++++++++++++++ dlio_benchmark/reader/dali_image_reader.py | 69 ++++++++++++++++ dlio_benchmark/reader/dali_npz_reader.py | 68 ++++++++++++++++ dlio_benchmark/reader/dali_tfrecord_reader.py | 78 +++++++++++++++++++ .../reader/{png_reader.py => image_reader.py} | 4 +- dlio_benchmark/reader/jpeg_reader.py | 62 --------------- dlio_benchmark/reader/npz_reader.py | 60 -------------- dlio_benchmark/reader/reader_factory.py | 26 ++++--- 10 files changed, 297 insertions(+), 134 deletions(-) create mode 100644 dlio_benchmark/data_loader/native_dali_data_loader.py create mode 100644 dlio_benchmark/reader/dali_image_reader.py create mode 100644 dlio_benchmark/reader/dali_npz_reader.py create mode 100644 dlio_benchmark/reader/dali_tfrecord_reader.py rename dlio_benchmark/reader/{png_reader.py => image_reader.py} (96%) delete mode 100644 dlio_benchmark/reader/jpeg_reader.py delete mode 100644 dlio_benchmark/reader/npz_reader.py diff --git a/dlio_benchmark/common/enumerations.py b/dlio_benchmark/common/enumerations.py index 64227772..0081195e 100644 --- a/dlio_benchmark/common/enumerations.py +++ b/dlio_benchmark/common/enumerations.py @@ -124,6 +124,7 @@ class DataLoaderType(Enum): TENSORFLOW='tensorflow' PYTORCH='pytorch' DALI='dali' + NATIVE_DALI = 'native_dali' CUSTOM='custom' NONE='none' diff --git a/dlio_benchmark/data_loader/data_loader_factory.py b/dlio_benchmark/data_loader/data_loader_factory.py index e8457450..13bf16b0 100644 --- a/dlio_benchmark/data_loader/data_loader_factory.py +++ b/dlio_benchmark/data_loader/data_loader_factory.py @@ -45,6 +45,9 @@ def get_loader(type, format_type, dataset_type, epoch): elif type == DataLoaderType.DALI: from dlio_benchmark.data_loader.dali_data_loader import DaliDataLoader return DaliDataLoader(format_type, dataset_type, epoch) + elif type == DataLoaderType.NATIVE_DALI: + from dlio_benchmark.data_loader.native_dali_data_loader import NativeDaliDataLoader + return NativeDaliDataLoader(format_type, dataset_type, epoch) else: print("Data Loader %s not supported or plugins not found" % type) raise Exception(str(ErrorCodes.EC1004)) diff --git a/dlio_benchmark/data_loader/native_dali_data_loader.py b/dlio_benchmark/data_loader/native_dali_data_loader.py new file mode 100644 index 00000000..2df04a5e --- /dev/null +++ b/dlio_benchmark/data_loader/native_dali_data_loader.py @@ -0,0 +1,60 @@ +from time import time +import logging +import math +import numpy as np +from nvidia.dali.pipeline import Pipeline +import nvidia.dali.fn as fn +import nvidia.dali.types as types +import nvidia.dali as dali +from nvidia.dali.plugin.pytorch import DALIGenericIterator + +from dlio_benchmark.common.constants import MODULE_DATA_LOADER +from dlio_benchmark.common.enumerations import Shuffle, DataLoaderType, DatasetType +from dlio_benchmark.data_loader.base_data_loader import BaseDataLoader +from dlio_benchmark.reader.reader_factory import ReaderFactory +from dlio_benchmark.utils.utility import utcnow, get_rank, timeit, Profile + +dlp = Profile(MODULE_DATA_LOADER) + + +class NativeDaliDataLoader(BaseDataLoader): + @dlp.log_init + def __init__(self, format_type, dataset_type, epoch): + super().__init__(format_type, dataset_type, epoch, DataLoaderType.NATIVE_DALI) + self.pipelines = [] + + @dlp.log + def read(self): + num_samples = self._args.total_samples_train if self.dataset_type is DatasetType.TRAIN else self._args.total_samples_eval + batch_size = self._args.batch_size if self.dataset_type is DatasetType.TRAIN else self._args.batch_size_eval + parallel = True if self._args.read_threads > 0 else False + self.pipelines = [] + num_threads = 1 + if self._args.read_threads > 0: + num_threads = self._args.read_threads + # None executes pipeline on CPU and the reader does the batching + pipeline = Pipeline(batch_size=batch_size, num_threads=num_threads, device_id=None, py_num_workers=num_threads, + exec_async=False, exec_pipelined=False) + with pipeline: + images = ReaderFactory.get_reader(type=self.format_type, + dataset_type=self.dataset_type, + thread_index=-1, + epoch_number=self.epoch_number).read() + pipeline.set_outputs(images) + self.pipelines.append(pipeline) + logging.info(f"{utcnow()} Creating {num_threads} pipelines by {self._args.my_rank} rank ") + + @dlp.log + def next(self): + super().next() + num_samples = self._args.total_samples_train if self.dataset_type is DatasetType.TRAIN else self._args.total_samples_eval + batch_size = self._args.batch_size if self.dataset_type is DatasetType.TRAIN else self._args.batch_size_eval + for step in range(num_samples // batch_size): + _dataset = DALIGenericIterator(self.pipelines, ['data']) + for batch in _dataset: + logging.info(f"{utcnow()} Creating {len(batch)} batches by {self._args.my_rank} rank ") + yield batch + + @dlp.log + def finalize(self): + pass diff --git a/dlio_benchmark/reader/dali_image_reader.py b/dlio_benchmark/reader/dali_image_reader.py new file mode 100644 index 00000000..adbc2c55 --- /dev/null +++ b/dlio_benchmark/reader/dali_image_reader.py @@ -0,0 +1,69 @@ +""" + Copyright (c) 2022, UChicago Argonne, LLC + All Rights Reserved + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +""" +import math +import logging +from time import time + +import nvidia.dali.fn as fn +from dlio_benchmark.common.constants import MODULE_DATA_READER +from dlio_benchmark.reader.dali_base_reader import DaliBaseReader +from dlio_benchmark.reader.tf_base_reader import TFBaseReader +from dlio_benchmark.utils.utility import utcnow, PerfTrace, Profile +from dlio_benchmark.common.enumerations import DatasetType, Shuffle +import nvidia.dali.tfrecord as tfrec + +dlp = Profile(MODULE_DATA_READER) + + +class DaliImageReader(DaliBaseReader): + @dlp.log_init + def __init__(self, dataset_type): + super().__init__(dataset_type) + + @dlp.log + def _load(self): + logging.debug( + f"{utcnow()} Reading {len(self.file_list)} files rank {self._args.my_rank}") + random_shuffle = False + seed = -1 + seed_change_epoch = False + if self._args.sample_shuffle is not Shuffle.OFF: + if self._args.sample_shuffle is not Shuffle.SEED: + seed = self._args.seed + random_shuffle = True + seed_change_epoch = True + initial_fill = 1024 + if self._args.shuffle_size > 0: + initial_fill = self._args.shuffle_size + prefetch_size = 1 + if self._args.prefetch_size > 0: + prefetch_size = self._args.prefetch_size + + stick_to_shard = True + if seed_change_epoch: + stick_to_shard = False + images, labels = fn.readers.file(files=files, num_shards=self._args.comm_size, + prefetch_queue_depth=prefetch_size, + initial_fill=initial_fill, random_shuffle=random_shuffle, + shuffle_after_epoch=seed_change_epoch, + stick_to_shard=stick_to_shard, pad_last_batch=True) + dataset = fn.decoders.image(jpegs, device='cpu') + return dataset + + @dlp.log + def finalize(self): + pass \ No newline at end of file diff --git a/dlio_benchmark/reader/dali_npz_reader.py b/dlio_benchmark/reader/dali_npz_reader.py new file mode 100644 index 00000000..f2887ef5 --- /dev/null +++ b/dlio_benchmark/reader/dali_npz_reader.py @@ -0,0 +1,68 @@ +""" + Copyright (c) 2022, UChicago Argonne, LLC + All Rights Reserved + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +""" +import math +import logging +from time import time + +import nvidia.dali.fn as fn +from dlio_benchmark.common.constants import MODULE_DATA_READER +from dlio_benchmark.reader.dali_base_reader import DaliBaseReader +from dlio_benchmark.reader.tf_base_reader import TFBaseReader +from dlio_benchmark.utils.utility import utcnow, PerfTrace, Profile +from dlio_benchmark.common.enumerations import DatasetType, Shuffle +import nvidia.dali.tfrecord as tfrec + +dlp = Profile(MODULE_DATA_READER) + + +class DaliNPZReader(DaliBaseReader): + @dlp.log_init + def __init__(self, dataset_type): + super().__init__(dataset_type) + + @dlp.log + def _load(self): + logging.debug( + f"{utcnow()} Reading {len(self.file_list)} files rank {self._args.my_rank}") + random_shuffle = False + seed = -1 + seed_change_epoch = False + if self._args.sample_shuffle is not Shuffle.OFF: + if self._args.sample_shuffle is not Shuffle.SEED: + seed = self._args.seed + random_shuffle = True + seed_change_epoch = True + initial_fill = 1024 + if self._args.shuffle_size > 0: + initial_fill = self._args.shuffle_size + prefetch_size = 1 + if self._args.prefetch_size > 0: + prefetch_size = self._args.prefetch_size + + stick_to_shard = True + if seed_change_epoch: + stick_to_shard = False + + dataset = fn.readers.numpy(device='cpu', files=self.file_list, num_shards=self._args.comm_size, + prefetch_queue_depth=prefetch_size, initial_fill=initial_fill, + random_shuffle=random_shuffle, seed=seed, shuffle_after_epoch=seed_change_epoch, + stick_to_shard=stick_to_shard, pad_last_batch=True) + return dataset + + @dlp.log + def finalize(self): + pass diff --git a/dlio_benchmark/reader/dali_tfrecord_reader.py b/dlio_benchmark/reader/dali_tfrecord_reader.py new file mode 100644 index 00000000..4b8147af --- /dev/null +++ b/dlio_benchmark/reader/dali_tfrecord_reader.py @@ -0,0 +1,78 @@ +""" + Copyright (c) 2022, UChicago Argonne, LLC + All Rights Reserved + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +""" +import os.path + +import math +import logging +from time import time + +import nvidia +import nvidia.dali.fn as fn +from dlio_benchmark.common.constants import MODULE_DATA_READER +from dlio_benchmark.reader.dali_base_reader import DaliBaseReader +from dlio_benchmark.reader.tf_base_reader import TFBaseReader +from dlio_benchmark.utils.utility import utcnow, PerfTrace, Profile +from dlio_benchmark.common.enumerations import DatasetType, Shuffle +import nvidia.dali.tfrecord as tfrec + +dlp = Profile(MODULE_DATA_READER) + + +class DaliTFRecordReader(DaliBaseReader): + @dlp.log_init + def __init__(self, dataset_type): + super().__init__(dataset_type) + + @dlp.log + def _load(self): + folder = "valid" + if self.dataset_type == DatasetType.TRAIN: + folder = "train" + index_folder = f"{self._args.data_folder}/index/{folder}" + index_files = [] + for file in self.file_list: + filename = os.path.basename(file) + index_files.append(f"{index_folder}/{filename}.idx") + logging.info( + f"{utcnow()} Reading {len(self.file_list)} files rank {self._args.my_rank}") + random_shuffle = False + seed = -1 + if self._args.sample_shuffle is not Shuffle.OFF: + if self._args.sample_shuffle is not Shuffle.SEED: + seed = self._args.seed + random_shuffle = True + initial_fill = 1024 + if self._args.shuffle_size > 0: + initial_fill = self._args.shuffle_size + prefetch_size = 1 + if self._args.prefetch_size > 0: + prefetch_size = self._args.prefetch_size + dataset = fn.readers.tfrecord(path=self.file_list, + index_path=index_files, + features={ + 'image': tfrec.FixedLenFeature((), tfrec.string, ""), + 'size': tfrec.FixedLenFeature([1], tfrec.int64, 0) + }, num_shards=self._args.comm_size, + prefetch_queue_depth=prefetch_size, + initial_fill=initial_fill, + random_shuffle=random_shuffle, seed=seed, + stick_to_shard=True, pad_last_batch=True) + return dataset["image"] + + @dlp.log + def finalize(self): + pass diff --git a/dlio_benchmark/reader/png_reader.py b/dlio_benchmark/reader/image_reader.py similarity index 96% rename from dlio_benchmark/reader/png_reader.py rename to dlio_benchmark/reader/image_reader.py index 64183dd3..1fe63a05 100644 --- a/dlio_benchmark/reader/png_reader.py +++ b/dlio_benchmark/reader/image_reader.py @@ -26,9 +26,9 @@ dlp = Profile(MODULE_DATA_READER) -class PNGReader(FormatReader): +class ImageReader(FormatReader): """ - Reader for PNG files + Reader for PNG / JPEG files """ @dlp.log_init diff --git a/dlio_benchmark/reader/jpeg_reader.py b/dlio_benchmark/reader/jpeg_reader.py deleted file mode 100644 index 664cde04..00000000 --- a/dlio_benchmark/reader/jpeg_reader.py +++ /dev/null @@ -1,62 +0,0 @@ -""" - Copyright (c) 2022, UChicago Argonne, LLC - All Rights Reserved - - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. -""" - -import numpy as np -from PIL import Image - -from dlio_benchmark.common.constants import MODULE_DATA_READER -from dlio_benchmark.reader.reader_handler import FormatReader -from dlio_profiler.logger import fn_interceptor as Profile - -dlp = Profile(MODULE_DATA_READER) - - -class JPEGReader(FormatReader): - """ - Reader for JPEG files - """ - - @dlp.log_init - def __init__(self, dataset_type, thread_index, epoch): - super().__init__(dataset_type, thread_index) - - @dlp.log - def open(self, filename): - super().open(filename) - return np.asarray(Image.open(filename)) - - @dlp.log - def close(self, filename): - super().close(filename) - - @dlp.log - def get_sample(self, filename, sample_index): - super().get_sample(filename, sample_index) - image = self.open_file_map[filename] - dlp.update(image_size=image.nbytes) - - def next(self): - for batch in super().next(): - yield batch - - @dlp.log - def read_index(self, image_idx, step): - return super().read_index(image_idx, step) - - @dlp.log - def finalize(self): - return super().finalize() diff --git a/dlio_benchmark/reader/npz_reader.py b/dlio_benchmark/reader/npz_reader.py deleted file mode 100644 index f0144f74..00000000 --- a/dlio_benchmark/reader/npz_reader.py +++ /dev/null @@ -1,60 +0,0 @@ -""" - Copyright (c) 2022, UChicago Argonne, LLC - All Rights Reserved - - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. -""" -import numpy as np - -from dlio_benchmark.common.constants import MODULE_DATA_READER -from dlio_benchmark.reader.reader_handler import FormatReader -from dlio_profiler.logger import fn_interceptor as Profile - -dlp = Profile(MODULE_DATA_READER) - - -class NPZReader(FormatReader): - """ - Reader for NPZ files - """ - - @dlp.log_init - def __init__(self, dataset_type, thread_index, epoch): - super().__init__(dataset_type, thread_index) - - @dlp.log - def open(self, filename): - super().open(filename) - return np.load(filename, allow_pickle=True)['x'] - - @dlp.log - def close(self, filename): - super().close(filename) - - @dlp.log - def get_sample(self, filename, sample_index): - super().get_sample(filename, sample_index) - image = self.open_file_map[filename][..., sample_index] - dlp.update(image_size=image.nbytes) - - def next(self): - for batch in super().next(): - yield batch - - @dlp.log - def read_index(self, image_idx, step): - return super().read_index(image_idx, step) - - @dlp.log - def finalize(self): - return super().finalize() \ No newline at end of file diff --git a/dlio_benchmark/reader/reader_factory.py b/dlio_benchmark/reader/reader_factory.py index 74fc353e..e6055dc4 100644 --- a/dlio_benchmark/reader/reader_factory.py +++ b/dlio_benchmark/reader/reader_factory.py @@ -43,18 +43,24 @@ def get_reader(type, dataset_type, thread_index, epoch_number): elif type == FormatType.CSV: from dlio_benchmark.reader.csv_reader import CSVReader return CSVReader(dataset_type, thread_index, epoch_number) - elif type == FormatType.JPEG: - from dlio_benchmark.reader.jpeg_reader import JPEGReader - return JPEGReader(dataset_type, thread_index, epoch_number) - elif type == FormatType.PNG: - from dlio_benchmark.reader.png_reader import PNGReader - return PNGReader(dataset_type, thread_index, epoch_number) + elif type == FormatType.JPEG or FormatType.PNG: + if _args.data_loader == DataLoaderType.NATIVE_DALI + from dlio_benchmark.reader.image_reader import ImageReader + return DaliImageReader(dataset_type, thread_index, epoch_number) elif type == FormatType.NPZ: - from dlio_benchmark.reader.npz_reader import NPZReader - return NPZReader(dataset_type, thread_index, epoch_number) + if _args.data_loader == DataLoaderType.NATIVE_DALI + from dlio_benchmark.reader.dali_npz_reader import DaliNPZReader + return DaliNPZReader(dataset_type, thread_index, epoch_number) + else: + from dlio_benchmark.reader.npz_reader import NPZReader + return NPZReader(dataset_type, thread_index, epoch_number) elif type == FormatType.TFRECORD: - from dlio_benchmark.reader.tf_reader import TFReader - return TFReader(dataset_type, thread_index, epoch_number) + if _args.data_loader == DataLoaderType.NATIVE_DALI: + from dlio_benchmark.reader.dali_tf_reader import DaliTFReader + return TFReader(dataset_type, thread_index, epoch_number) + else: + from dlio_benchmark.reader.tf_reader import TFReader + return TFReader(dataset_type, thread_index, epoch_number) else: print("Loading data of %s format is not supported without framework data loader" %type) raise Exception(type) From 63cb1c16be269a2d648d3662c4ad8e2ed79b0162 Mon Sep 17 00:00:00 2001 From: Huihuo Zheng Date: Fri, 1 Dec 2023 20:25:43 +0000 Subject: [PATCH 03/36] added back npz_reader --- dlio_benchmark/reader/npz_reader.py | 60 +++++++++++++++++++++++++++++ 1 file changed, 60 insertions(+) create mode 100644 dlio_benchmark/reader/npz_reader.py diff --git a/dlio_benchmark/reader/npz_reader.py b/dlio_benchmark/reader/npz_reader.py new file mode 100644 index 00000000..f0144f74 --- /dev/null +++ b/dlio_benchmark/reader/npz_reader.py @@ -0,0 +1,60 @@ +""" + Copyright (c) 2022, UChicago Argonne, LLC + All Rights Reserved + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +""" +import numpy as np + +from dlio_benchmark.common.constants import MODULE_DATA_READER +from dlio_benchmark.reader.reader_handler import FormatReader +from dlio_profiler.logger import fn_interceptor as Profile + +dlp = Profile(MODULE_DATA_READER) + + +class NPZReader(FormatReader): + """ + Reader for NPZ files + """ + + @dlp.log_init + def __init__(self, dataset_type, thread_index, epoch): + super().__init__(dataset_type, thread_index) + + @dlp.log + def open(self, filename): + super().open(filename) + return np.load(filename, allow_pickle=True)['x'] + + @dlp.log + def close(self, filename): + super().close(filename) + + @dlp.log + def get_sample(self, filename, sample_index): + super().get_sample(filename, sample_index) + image = self.open_file_map[filename][..., sample_index] + dlp.update(image_size=image.nbytes) + + def next(self): + for batch in super().next(): + yield batch + + @dlp.log + def read_index(self, image_idx, step): + return super().read_index(image_idx, step) + + @dlp.log + def finalize(self): + return super().finalize() \ No newline at end of file From cdd73894ef95fa0760adda479fc98ddd5326e77e Mon Sep 17 00:00:00 2001 From: Huihuo Zheng Date: Fri, 1 Dec 2023 20:30:16 +0000 Subject: [PATCH 04/36] fixed bugs --- dlio_benchmark/reader/reader_factory.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/dlio_benchmark/reader/reader_factory.py b/dlio_benchmark/reader/reader_factory.py index e6055dc4..3ec683fb 100644 --- a/dlio_benchmark/reader/reader_factory.py +++ b/dlio_benchmark/reader/reader_factory.py @@ -44,11 +44,11 @@ def get_reader(type, dataset_type, thread_index, epoch_number): from dlio_benchmark.reader.csv_reader import CSVReader return CSVReader(dataset_type, thread_index, epoch_number) elif type == FormatType.JPEG or FormatType.PNG: - if _args.data_loader == DataLoaderType.NATIVE_DALI + if _args.data_loader == DataLoaderType.NATIVE_DALI: from dlio_benchmark.reader.image_reader import ImageReader return DaliImageReader(dataset_type, thread_index, epoch_number) elif type == FormatType.NPZ: - if _args.data_loader == DataLoaderType.NATIVE_DALI + if _args.data_loader == DataLoaderType.NATIVE_DALI: from dlio_benchmark.reader.dali_npz_reader import DaliNPZReader return DaliNPZReader(dataset_type, thread_index, epoch_number) else: From 4f0008e940dcf6cd31bc88dfd96353c430d26378 Mon Sep 17 00:00:00 2001 From: Huihuo Zheng Date: Fri, 1 Dec 2023 20:43:36 +0000 Subject: [PATCH 05/36] fixed bugs --- dlio_benchmark/reader/reader_factory.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/dlio_benchmark/reader/reader_factory.py b/dlio_benchmark/reader/reader_factory.py index 3ec683fb..fdd12ee0 100644 --- a/dlio_benchmark/reader/reader_factory.py +++ b/dlio_benchmark/reader/reader_factory.py @@ -43,7 +43,7 @@ def get_reader(type, dataset_type, thread_index, epoch_number): elif type == FormatType.CSV: from dlio_benchmark.reader.csv_reader import CSVReader return CSVReader(dataset_type, thread_index, epoch_number) - elif type == FormatType.JPEG or FormatType.PNG: + elif type == FormatType.JPEG or type == FormatType.PNG: if _args.data_loader == DataLoaderType.NATIVE_DALI: from dlio_benchmark.reader.image_reader import ImageReader return DaliImageReader(dataset_type, thread_index, epoch_number) @@ -57,7 +57,7 @@ def get_reader(type, dataset_type, thread_index, epoch_number): elif type == FormatType.TFRECORD: if _args.data_loader == DataLoaderType.NATIVE_DALI: from dlio_benchmark.reader.dali_tf_reader import DaliTFReader - return TFReader(dataset_type, thread_index, epoch_number) + return DaliTFReader(dataset_type, thread_index, epoch_number) else: from dlio_benchmark.reader.tf_reader import TFReader return TFReader(dataset_type, thread_index, epoch_number) From fbc9748b3f6416696db21e38b619b62c8818a4d3 Mon Sep 17 00:00:00 2001 From: Huihuo Zheng Date: Fri, 1 Dec 2023 20:48:50 +0000 Subject: [PATCH 06/36] fixed image reader issue --- dlio_benchmark/reader/reader_factory.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/dlio_benchmark/reader/reader_factory.py b/dlio_benchmark/reader/reader_factory.py index fdd12ee0..b7f39cb5 100644 --- a/dlio_benchmark/reader/reader_factory.py +++ b/dlio_benchmark/reader/reader_factory.py @@ -45,8 +45,11 @@ def get_reader(type, dataset_type, thread_index, epoch_number): return CSVReader(dataset_type, thread_index, epoch_number) elif type == FormatType.JPEG or type == FormatType.PNG: if _args.data_loader == DataLoaderType.NATIVE_DALI: - from dlio_benchmark.reader.image_reader import ImageReader + from dlio_benchmark.reader.dali_image_reader import DaliImageReader return DaliImageReader(dataset_type, thread_index, epoch_number) + else: + from dlio_benchmark.reader.image_reader import ImageReader + return ImageReader(dataset_type, thread_index, epoch_number) elif type == FormatType.NPZ: if _args.data_loader == DataLoaderType.NATIVE_DALI: from dlio_benchmark.reader.dali_npz_reader import DaliNPZReader From 9bde93cad3f7c39dff1f9290403b03fffffff884 Mon Sep 17 00:00:00 2001 From: Huihuo Zheng Date: Wed, 6 Dec 2023 21:48:22 +0000 Subject: [PATCH 07/36] fixed Profile, PerfTrace --- dlio_benchmark/data_generator/tf_generator.py | 15 ++++- .../data_loader/native_dali_data_loader.py | 3 +- dlio_benchmark/reader/dali_base_reader.py | 65 +++++++++++++++++++ dlio_benchmark/reader/dali_image_reader.py | 4 +- dlio_benchmark/reader/dali_npz_reader.py | 4 +- dlio_benchmark/reader/dali_tfrecord_reader.py | 4 +- dlio_benchmark/reader/reader_factory.py | 8 +-- dlio_benchmark/utils/config.py | 6 +- 8 files changed, 93 insertions(+), 16 deletions(-) create mode 100644 dlio_benchmark/reader/dali_base_reader.py diff --git a/dlio_benchmark/data_generator/tf_generator.py b/dlio_benchmark/data_generator/tf_generator.py index f10a9621..b1a94c9c 100644 --- a/dlio_benchmark/data_generator/tf_generator.py +++ b/dlio_benchmark/data_generator/tf_generator.py @@ -14,12 +14,15 @@ See the License for the specific language governing permissions and limitations under the License. """ +import os +from subprocess import call from dlio_benchmark.data_generator.data_generator import DataGenerator import numpy as np import tensorflow as tf -from dlio_benchmark.utils.utility import progress, utcnow from dlio_profiler.logger import fn_interceptor as Profile + +from dlio_benchmark.utils.utility import progress, utcnow from shutil import copyfile from dlio_benchmark.common.constants import MODULE_DATA_GENERATOR @@ -64,4 +67,14 @@ def generate(self): serialized = example.SerializeToString() # Write the serialized data to the TFRecords file. writer.write(serialized) + tfrecord2idx_script = "tfrecord2idx" + folder = "train" + if "valid" in out_path_spec: + folder = "valid" + index_folder = f"{self._args.data_folder}/index/{folder}" + filename = os.path.basename(out_path_spec) + self.storage.create_node(index_folder, exist_ok=True) + tfrecord_idx = f"{index_folder}/{filename}.idx" + if not os.path.isfile(tfrecord_idx): + call([tfrecord2idx_script, out_path_spec, tfrecord_idx]) np.random.seed() diff --git a/dlio_benchmark/data_loader/native_dali_data_loader.py b/dlio_benchmark/data_loader/native_dali_data_loader.py index 2df04a5e..86713315 100644 --- a/dlio_benchmark/data_loader/native_dali_data_loader.py +++ b/dlio_benchmark/data_loader/native_dali_data_loader.py @@ -12,7 +12,8 @@ from dlio_benchmark.common.enumerations import Shuffle, DataLoaderType, DatasetType from dlio_benchmark.data_loader.base_data_loader import BaseDataLoader from dlio_benchmark.reader.reader_factory import ReaderFactory -from dlio_benchmark.utils.utility import utcnow, get_rank, timeit, Profile +from dlio_benchmark.utils.utility import utcnow, get_rank, timeit +from dlio_profiler.logger import dlio_logger as PerfTrace, fn_interceptor as Profile dlp = Profile(MODULE_DATA_LOADER) diff --git a/dlio_benchmark/reader/dali_base_reader.py b/dlio_benchmark/reader/dali_base_reader.py new file mode 100644 index 00000000..b9dac9f1 --- /dev/null +++ b/dlio_benchmark/reader/dali_base_reader.py @@ -0,0 +1,65 @@ +""" + Copyright (c) 2022, UChicago Argonne, LLC + All Rights Reserved + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +""" +from time import sleep + +import numpy as np +import nvidia +from abc import ABC, abstractmethod +from nvidia import dali + +from dlio_benchmark.common.constants import MODULE_DATA_READER +from dlio_benchmark.common.enumerations import DatasetType +from dlio_benchmark.utils.config import ConfigArguments +from dlio_profiler.logger import dlio_logger as PerfTrace, fn_interceptor as Profile + +from nvidia.dali import fn +dlp = Profile(MODULE_DATA_READER) + +class DaliBaseReader(ABC): + + @dlp.log_init + def __init__(self, dataset_type): + self.dataset_type = dataset_type + self._args = ConfigArguments.get_instance() + self.batch_size = self._args.batch_size if self.dataset_type is DatasetType.TRAIN else self._args.batch_size_eval + self.file_list = self._args.file_list_train if self.dataset_type is DatasetType.TRAIN else self._args.file_list_eval + + @dlp.log + def _preprocess(self, dataset): + if self._args.preprocess_time != 0. or self._args.preprocess_time_stdev != 0.: + t = np.random.normal(self._args.preprocess_time, self._args.preprocess_time_stdev) + sleep(max(t, 0.0)) + return dataset + + @dlp.log + def _resize(self, dataset): + return nvidia.dali.fn.reshape(dataset, shape=[self._args.max_dimension, self._args.max_dimension]) + + @abstractmethod + def _load(self): + pass + + @dlp.log + def read(self): + dataset = self._load() + #dataset = self._resize(dataset) + #dataset = nvidia.dali.fn.python_function(dataset, function= self._preprocess, num_outputs=1) + return dataset + + @abstractmethod + def finalize(self): + pass diff --git a/dlio_benchmark/reader/dali_image_reader.py b/dlio_benchmark/reader/dali_image_reader.py index adbc2c55..98f839b1 100644 --- a/dlio_benchmark/reader/dali_image_reader.py +++ b/dlio_benchmark/reader/dali_image_reader.py @@ -21,10 +21,10 @@ import nvidia.dali.fn as fn from dlio_benchmark.common.constants import MODULE_DATA_READER from dlio_benchmark.reader.dali_base_reader import DaliBaseReader -from dlio_benchmark.reader.tf_base_reader import TFBaseReader -from dlio_benchmark.utils.utility import utcnow, PerfTrace, Profile +from dlio_benchmark.utils.utility import utcnow from dlio_benchmark.common.enumerations import DatasetType, Shuffle import nvidia.dali.tfrecord as tfrec +from dlio_profiler.logger import dlio_logger as PerfTrace, fn_interceptor as Profile dlp = Profile(MODULE_DATA_READER) diff --git a/dlio_benchmark/reader/dali_npz_reader.py b/dlio_benchmark/reader/dali_npz_reader.py index f2887ef5..9114716f 100644 --- a/dlio_benchmark/reader/dali_npz_reader.py +++ b/dlio_benchmark/reader/dali_npz_reader.py @@ -21,10 +21,10 @@ import nvidia.dali.fn as fn from dlio_benchmark.common.constants import MODULE_DATA_READER from dlio_benchmark.reader.dali_base_reader import DaliBaseReader -from dlio_benchmark.reader.tf_base_reader import TFBaseReader -from dlio_benchmark.utils.utility import utcnow, PerfTrace, Profile +from dlio_benchmark.utils.utility import utcnow from dlio_benchmark.common.enumerations import DatasetType, Shuffle import nvidia.dali.tfrecord as tfrec +from dlio_profiler.logger import dlio_logger as PerfTrace, fn_interceptor as Profile dlp = Profile(MODULE_DATA_READER) diff --git a/dlio_benchmark/reader/dali_tfrecord_reader.py b/dlio_benchmark/reader/dali_tfrecord_reader.py index 4b8147af..d66285d5 100644 --- a/dlio_benchmark/reader/dali_tfrecord_reader.py +++ b/dlio_benchmark/reader/dali_tfrecord_reader.py @@ -24,10 +24,10 @@ import nvidia.dali.fn as fn from dlio_benchmark.common.constants import MODULE_DATA_READER from dlio_benchmark.reader.dali_base_reader import DaliBaseReader -from dlio_benchmark.reader.tf_base_reader import TFBaseReader -from dlio_benchmark.utils.utility import utcnow, PerfTrace, Profile +from dlio_benchmark.utils.utility import utcnow from dlio_benchmark.common.enumerations import DatasetType, Shuffle import nvidia.dali.tfrecord as tfrec +from dlio_profiler.logger import dlio_logger as PerfTrace, fn_interceptor as Profile dlp = Profile(MODULE_DATA_READER) diff --git a/dlio_benchmark/reader/reader_factory.py b/dlio_benchmark/reader/reader_factory.py index b7f39cb5..10d7bc6c 100644 --- a/dlio_benchmark/reader/reader_factory.py +++ b/dlio_benchmark/reader/reader_factory.py @@ -46,21 +46,21 @@ def get_reader(type, dataset_type, thread_index, epoch_number): elif type == FormatType.JPEG or type == FormatType.PNG: if _args.data_loader == DataLoaderType.NATIVE_DALI: from dlio_benchmark.reader.dali_image_reader import DaliImageReader - return DaliImageReader(dataset_type, thread_index, epoch_number) + return DaliImageReader(dataset_type) else: from dlio_benchmark.reader.image_reader import ImageReader return ImageReader(dataset_type, thread_index, epoch_number) elif type == FormatType.NPZ: if _args.data_loader == DataLoaderType.NATIVE_DALI: from dlio_benchmark.reader.dali_npz_reader import DaliNPZReader - return DaliNPZReader(dataset_type, thread_index, epoch_number) + return DaliNPZReader(dataset_type) else: from dlio_benchmark.reader.npz_reader import NPZReader return NPZReader(dataset_type, thread_index, epoch_number) elif type == FormatType.TFRECORD: if _args.data_loader == DataLoaderType.NATIVE_DALI: - from dlio_benchmark.reader.dali_tf_reader import DaliTFReader - return DaliTFReader(dataset_type, thread_index, epoch_number) + from dlio_benchmark.reader.dali_tfrecord_reader import DaliTFRecordReader + return DaliTFRecordReader(dataset_type) else: from dlio_benchmark.reader.tf_reader import TFReader return TFReader(dataset_type, thread_index, epoch_number) diff --git a/dlio_benchmark/utils/config.py b/dlio_benchmark/utils/config.py index c6d319d7..05bcd400 100644 --- a/dlio_benchmark/utils/config.py +++ b/dlio_benchmark/utils/config.py @@ -149,10 +149,8 @@ def validate(self): if (self.do_profiling == True) and (self.profiler == Profiler('darshan')): if ('LD_PRELOAD' not in os.environ or os.environ["LD_PRELOAD"].find("libdarshan") == -1): raise Exception("Please set darshan runtime library in LD_PRELOAD") - if self.format is FormatType.TFRECORD and self.framework is not FrameworkType.TENSORFLOW: - raise Exception(f"{self.framework} support for tfrecord is not implemented.") - if self.format is FormatType.TFRECORD and self.data_loader is not DataLoaderType.TENSORFLOW: - raise Exception(f"{self.data_loader} support for tfrecord is not implemented.") + if self.format is FormatType.TFRECORD and (self.data_loader is DataLoaderType.PYTORCH): + raise Exception(f"{self.framework} support for tfrecord is not implemented for {self.data_loader}.") if (self.framework == FrameworkType.TENSORFLOW and self.data_loader == DataLoaderType.PYTORCH) or ( self.framework == FrameworkType.PYTORCH and self.data_loader == DataLoaderType.TENSORFLOW): raise Exception("Imcompatible between framework and data_loader setup.") From c976bc54fe59feae2a88439c0d35d49848723c46 Mon Sep 17 00:00:00 2001 From: Huihuo Zheng Date: Wed, 6 Dec 2023 21:52:01 +0000 Subject: [PATCH 08/36] removed unnecessary logs --- dlio_benchmark/data_loader/native_dali_data_loader.py | 1 - 1 file changed, 1 deletion(-) diff --git a/dlio_benchmark/data_loader/native_dali_data_loader.py b/dlio_benchmark/data_loader/native_dali_data_loader.py index 86713315..838f2d1b 100644 --- a/dlio_benchmark/data_loader/native_dali_data_loader.py +++ b/dlio_benchmark/data_loader/native_dali_data_loader.py @@ -43,7 +43,6 @@ def read(self): epoch_number=self.epoch_number).read() pipeline.set_outputs(images) self.pipelines.append(pipeline) - logging.info(f"{utcnow()} Creating {num_threads} pipelines by {self._args.my_rank} rank ") @dlp.log def next(self): From 58febf3298ce3feeb1c4f948c16cd6551c7f74b3 Mon Sep 17 00:00:00 2001 From: Huihuo Zheng Date: Wed, 6 Dec 2023 22:01:13 +0000 Subject: [PATCH 09/36] fixed dali_image_reader --- dlio_benchmark/reader/dali_image_reader.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dlio_benchmark/reader/dali_image_reader.py b/dlio_benchmark/reader/dali_image_reader.py index 98f839b1..08518545 100644 --- a/dlio_benchmark/reader/dali_image_reader.py +++ b/dlio_benchmark/reader/dali_image_reader.py @@ -56,7 +56,7 @@ def _load(self): stick_to_shard = True if seed_change_epoch: stick_to_shard = False - images, labels = fn.readers.file(files=files, num_shards=self._args.comm_size, + images, labels = fn.readers.file(files=self.file_list, num_shards=self._args.comm_size, prefetch_queue_depth=prefetch_size, initial_fill=initial_fill, random_shuffle=random_shuffle, shuffle_after_epoch=seed_change_epoch, From 3cee0fda1189558b7f153feb655011bbdde63cb0 Mon Sep 17 00:00:00 2001 From: Huihuo Zheng Date: Wed, 6 Dec 2023 22:02:00 +0000 Subject: [PATCH 10/36] fixed dali_image_reader --- dlio_benchmark/reader/dali_image_reader.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dlio_benchmark/reader/dali_image_reader.py b/dlio_benchmark/reader/dali_image_reader.py index 08518545..b814de86 100644 --- a/dlio_benchmark/reader/dali_image_reader.py +++ b/dlio_benchmark/reader/dali_image_reader.py @@ -61,7 +61,7 @@ def _load(self): initial_fill=initial_fill, random_shuffle=random_shuffle, shuffle_after_epoch=seed_change_epoch, stick_to_shard=stick_to_shard, pad_last_batch=True) - dataset = fn.decoders.image(jpegs, device='cpu') + dataset = fn.decoders.image(images, device='cpu') return dataset @dlp.log From a666d5c3d5f3a7ebca1c1013d3a3756a6907d688 Mon Sep 17 00:00:00 2001 From: Huihuo Zheng Date: Wed, 6 Dec 2023 22:15:40 +0000 Subject: [PATCH 11/36] added support for npy format --- dlio_benchmark/common/enumerations.py | 3 + .../data_generator/generator_factory.py | 3 + ...{dali_npz_reader.py => dali_npy_reader.py} | 2 +- dlio_benchmark/reader/npy_reader.py | 60 +++++++++++++++++++ dlio_benchmark/reader/reader_factory.py | 13 +++- 5 files changed, 77 insertions(+), 4 deletions(-) rename dlio_benchmark/reader/{dali_npz_reader.py => dali_npy_reader.py} (98%) create mode 100644 dlio_benchmark/reader/npy_reader.py diff --git a/dlio_benchmark/common/enumerations.py b/dlio_benchmark/common/enumerations.py index 0081195e..c8c448b5 100644 --- a/dlio_benchmark/common/enumerations.py +++ b/dlio_benchmark/common/enumerations.py @@ -93,6 +93,7 @@ class FormatType(Enum): HDF5 = 'hdf5' CSV = 'csv' NPZ = 'npz' + NPY = 'npy' HDF5_OPT = 'hdf5_opt' JPEG = 'jpeg' PNG = 'png' @@ -110,6 +111,8 @@ def get_enum(value): return FormatType.CSV elif FormatType.NPZ.value == value: return FormatType.NPZ + elif FormatType.NPY.value == value: + return FormatType.NPY elif FormatType.HDF5_OPT.value == value: return FormatType.HDF5_OPT elif FormatType.JPEG.value == value: diff --git a/dlio_benchmark/data_generator/generator_factory.py b/dlio_benchmark/data_generator/generator_factory.py index 7c05d3a4..e61ead4c 100644 --- a/dlio_benchmark/data_generator/generator_factory.py +++ b/dlio_benchmark/data_generator/generator_factory.py @@ -38,6 +38,9 @@ def get_generator(type): elif type == FormatType.NPZ: from dlio_benchmark.data_generator.npz_generator import NPZGenerator return NPZGenerator() + elif type == FormatType.NPY: + from dlio_benchmark.data_generator.npy_generator import NPYGenerator + return NPYGenerator() elif type == FormatType.JPEG: from dlio_benchmark.data_generator.jpeg_generator import JPEGGenerator return JPEGGenerator() diff --git a/dlio_benchmark/reader/dali_npz_reader.py b/dlio_benchmark/reader/dali_npy_reader.py similarity index 98% rename from dlio_benchmark/reader/dali_npz_reader.py rename to dlio_benchmark/reader/dali_npy_reader.py index 9114716f..ee0592ef 100644 --- a/dlio_benchmark/reader/dali_npz_reader.py +++ b/dlio_benchmark/reader/dali_npy_reader.py @@ -29,7 +29,7 @@ dlp = Profile(MODULE_DATA_READER) -class DaliNPZReader(DaliBaseReader): +class DaliNPYReader(DaliBaseReader): @dlp.log_init def __init__(self, dataset_type): super().__init__(dataset_type) diff --git a/dlio_benchmark/reader/npy_reader.py b/dlio_benchmark/reader/npy_reader.py new file mode 100644 index 00000000..435df6b3 --- /dev/null +++ b/dlio_benchmark/reader/npy_reader.py @@ -0,0 +1,60 @@ +""" + Copyright (c) 2022, UChicago Argonne, LLC + All Rights Reserved + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +""" +import numpy as np + +from dlio_benchmark.common.constants import MODULE_DATA_READER +from dlio_benchmark.reader.reader_handler import FormatReader +from dlio_profiler.logger import fn_interceptor as Profile + +dlp = Profile(MODULE_DATA_READER) + + +class NPYReader(FormatReader): + """ + Reader for NPZ files + """ + + @dlp.log_init + def __init__(self, dataset_type, thread_index, epoch): + super().__init__(dataset_type, thread_index) + + @dlp.log + def open(self, filename): + super().open(filename) + return np.load(filename) + + @dlp.log + def close(self, filename): + super().close(filename) + + @dlp.log + def get_sample(self, filename, sample_index): + super().get_sample(filename, sample_index) + image = self.open_file_map[filename][..., sample_index] + dlp.update(image_size=image.nbytes) + + def next(self): + for batch in super().next(): + yield batch + + @dlp.log + def read_index(self, image_idx, step): + return super().read_index(image_idx, step) + + @dlp.log + def finalize(self): + return super().finalize() \ No newline at end of file diff --git a/dlio_benchmark/reader/reader_factory.py b/dlio_benchmark/reader/reader_factory.py index 10d7bc6c..68f9bbb8 100644 --- a/dlio_benchmark/reader/reader_factory.py +++ b/dlio_benchmark/reader/reader_factory.py @@ -49,11 +49,18 @@ def get_reader(type, dataset_type, thread_index, epoch_number): return DaliImageReader(dataset_type) else: from dlio_benchmark.reader.image_reader import ImageReader - return ImageReader(dataset_type, thread_index, epoch_number) + return ImageReader(dataset_type, thread_index, epoch_number) + elif type == FormatType.NPY: + if _args.data_loader == DataLoaderType.NATIVE_DALI: + from dlio_benchmark.reader.dali_npy_reader import DaliNPYReader + return DaliNPYReader(dataset_type) + else: + from dlio_benchmark.reader.npy_reader import NPYReader + return NPYReader(dataset_type, thread_index, epoch_number) elif type == FormatType.NPZ: if _args.data_loader == DataLoaderType.NATIVE_DALI: - from dlio_benchmark.reader.dali_npz_reader import DaliNPZReader - return DaliNPZReader(dataset_type) + print("Loading data of %s format is not supported without framework data loader" %type) + raise Exception(type) else: from dlio_benchmark.reader.npz_reader import NPZReader return NPZReader(dataset_type, thread_index, epoch_number) From f0722b6fbac42d2ec51107df20c330c66e8e62df Mon Sep 17 00:00:00 2001 From: Huihuo Zheng Date: Wed, 6 Dec 2023 22:17:51 +0000 Subject: [PATCH 12/36] added support for npy format --- dlio_benchmark/data_loader/native_dali_data_loader.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dlio_benchmark/data_loader/native_dali_data_loader.py b/dlio_benchmark/data_loader/native_dali_data_loader.py index 838f2d1b..8799800b 100644 --- a/dlio_benchmark/data_loader/native_dali_data_loader.py +++ b/dlio_benchmark/data_loader/native_dali_data_loader.py @@ -52,7 +52,7 @@ def next(self): for step in range(num_samples // batch_size): _dataset = DALIGenericIterator(self.pipelines, ['data']) for batch in _dataset: - logging.info(f"{utcnow()} Creating {len(batch)} batches by {self._args.my_rank} rank ") + logging.debug(f"{utcnow()} Creating {len(batch)} batches by {self._args.my_rank} rank ") yield batch @dlp.log From 7155d5d87e79e54102aaa0046b1833558a248a96 Mon Sep 17 00:00:00 2001 From: Huihuo Zheng Date: Fri, 8 Dec 2023 19:35:33 +0000 Subject: [PATCH 13/36] changed enumerations --- dlio_benchmark/common/enumerations.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dlio_benchmark/common/enumerations.py b/dlio_benchmark/common/enumerations.py index 0081195e..ef5b48a8 100644 --- a/dlio_benchmark/common/enumerations.py +++ b/dlio_benchmark/common/enumerations.py @@ -124,7 +124,7 @@ class DataLoaderType(Enum): TENSORFLOW='tensorflow' PYTORCH='pytorch' DALI='dali' - NATIVE_DALI = 'native_dali' + NATIVE_DALI='native_dali' CUSTOM='custom' NONE='none' From 9935840128695b848e550b4aff0f4409035d1594 Mon Sep 17 00:00:00 2001 From: Huihuo Zheng Date: Mon, 11 Dec 2023 16:54:39 +0000 Subject: [PATCH 14/36] added removed dali base reader --- dlio_benchmark/reader/dali_base_reader.py | 65 ------------------- dlio_benchmark/reader/dali_image_reader.py | 23 +++++-- dlio_benchmark/reader/dali_npy_reader.py | 24 +++++-- dlio_benchmark/reader/dali_tfrecord_reader.py | 25 +++++-- dlio_benchmark/reader/reader_factory.py | 6 +- dlio_benchmark/reader/reader_handler.py | 4 ++ 6 files changed, 63 insertions(+), 84 deletions(-) delete mode 100644 dlio_benchmark/reader/dali_base_reader.py diff --git a/dlio_benchmark/reader/dali_base_reader.py b/dlio_benchmark/reader/dali_base_reader.py deleted file mode 100644 index b9dac9f1..00000000 --- a/dlio_benchmark/reader/dali_base_reader.py +++ /dev/null @@ -1,65 +0,0 @@ -""" - Copyright (c) 2022, UChicago Argonne, LLC - All Rights Reserved - - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. -""" -from time import sleep - -import numpy as np -import nvidia -from abc import ABC, abstractmethod -from nvidia import dali - -from dlio_benchmark.common.constants import MODULE_DATA_READER -from dlio_benchmark.common.enumerations import DatasetType -from dlio_benchmark.utils.config import ConfigArguments -from dlio_profiler.logger import dlio_logger as PerfTrace, fn_interceptor as Profile - -from nvidia.dali import fn -dlp = Profile(MODULE_DATA_READER) - -class DaliBaseReader(ABC): - - @dlp.log_init - def __init__(self, dataset_type): - self.dataset_type = dataset_type - self._args = ConfigArguments.get_instance() - self.batch_size = self._args.batch_size if self.dataset_type is DatasetType.TRAIN else self._args.batch_size_eval - self.file_list = self._args.file_list_train if self.dataset_type is DatasetType.TRAIN else self._args.file_list_eval - - @dlp.log - def _preprocess(self, dataset): - if self._args.preprocess_time != 0. or self._args.preprocess_time_stdev != 0.: - t = np.random.normal(self._args.preprocess_time, self._args.preprocess_time_stdev) - sleep(max(t, 0.0)) - return dataset - - @dlp.log - def _resize(self, dataset): - return nvidia.dali.fn.reshape(dataset, shape=[self._args.max_dimension, self._args.max_dimension]) - - @abstractmethod - def _load(self): - pass - - @dlp.log - def read(self): - dataset = self._load() - #dataset = self._resize(dataset) - #dataset = nvidia.dali.fn.python_function(dataset, function= self._preprocess, num_outputs=1) - return dataset - - @abstractmethod - def finalize(self): - pass diff --git a/dlio_benchmark/reader/dali_image_reader.py b/dlio_benchmark/reader/dali_image_reader.py index b814de86..258e9c7d 100644 --- a/dlio_benchmark/reader/dali_image_reader.py +++ b/dlio_benchmark/reader/dali_image_reader.py @@ -20,7 +20,7 @@ import nvidia.dali.fn as fn from dlio_benchmark.common.constants import MODULE_DATA_READER -from dlio_benchmark.reader.dali_base_reader import DaliBaseReader +from dlio_benchmark.dlio_benchmark.reader.reader_handler import FormatReader from dlio_benchmark.utils.utility import utcnow from dlio_benchmark.common.enumerations import DatasetType, Shuffle import nvidia.dali.tfrecord as tfrec @@ -29,13 +29,13 @@ dlp = Profile(MODULE_DATA_READER) -class DaliImageReader(DaliBaseReader): +class DaliImageReader(FormatReader): @dlp.log_init - def __init__(self, dataset_type): - super().__init__(dataset_type) + def __init__(dataset_type, thread_index, epoch): + super().__init__(dataset_type, thread_index) @dlp.log - def _load(self): + def read(self): logging.debug( f"{utcnow()} Reading {len(self.file_list)} files rank {self._args.my_rank}") random_shuffle = False @@ -62,8 +62,21 @@ def _load(self): shuffle_after_epoch=seed_change_epoch, stick_to_shard=stick_to_shard, pad_last_batch=True) dataset = fn.decoders.image(images, device='cpu') + dataset = self._preprocess(images) + dataset = self._resize(images) return dataset + @dlp.log + def _preprocess(self, dataset): + if self._args.preprocess_time != 0. or self._args.preprocess_time_stdev != 0.: + t = np.random.normal(self._args.preprocess_time, self._args.preprocess_time_stdev) + sleep(max(t, 0.0)) + return dataset + + @dlp.log + def _resize(self, dataset): + return nvidia.dali.fn.reshape(dataset, shape=[self._args.max_dimension, self._args.max_dimension]) + @dlp.log def finalize(self): pass \ No newline at end of file diff --git a/dlio_benchmark/reader/dali_npy_reader.py b/dlio_benchmark/reader/dali_npy_reader.py index ee0592ef..f2b12730 100644 --- a/dlio_benchmark/reader/dali_npy_reader.py +++ b/dlio_benchmark/reader/dali_npy_reader.py @@ -20,7 +20,7 @@ import nvidia.dali.fn as fn from dlio_benchmark.common.constants import MODULE_DATA_READER -from dlio_benchmark.reader.dali_base_reader import DaliBaseReader +from dlio_benchmark.dlio_benchmark.reader.reader_handler import FormatReader from dlio_benchmark.utils.utility import utcnow from dlio_benchmark.common.enumerations import DatasetType, Shuffle import nvidia.dali.tfrecord as tfrec @@ -29,13 +29,13 @@ dlp = Profile(MODULE_DATA_READER) -class DaliNPYReader(DaliBaseReader): +class DaliNPYReader(NativeDaliBaseReader): @dlp.log_init - def __init__(self, dataset_type): - super().__init__(dataset_type) + def __init__(dataset_type, thread_index, epoch): + super().__init__(dataset_type, thread_index) @dlp.log - def _load(self): + def read(self): logging.debug( f"{utcnow()} Reading {len(self.file_list)} files rank {self._args.my_rank}") random_shuffle = False @@ -61,8 +61,22 @@ def _load(self): prefetch_queue_depth=prefetch_size, initial_fill=initial_fill, random_shuffle=random_shuffle, seed=seed, shuffle_after_epoch=seed_change_epoch, stick_to_shard=stick_to_shard, pad_last_batch=True) + + dataset = self._preprocess(dataset) + dataset = self._resize(dataset) return dataset + + @dlp.log + def _preprocess(self, dataset): + if self._args.preprocess_time != 0. or self._args.preprocess_time_stdev != 0.: + t = np.random.normal(self._args.preprocess_time, self._args.preprocess_time_stdev) + sleep(max(t, 0.0)) + return dataset + + @dlp.log + def _resize(self, dataset): + return nvidia.dali.fn.reshape(dataset, shape=[self._args.max_dimension, self._args.max_dimension]) @dlp.log def finalize(self): pass diff --git a/dlio_benchmark/reader/dali_tfrecord_reader.py b/dlio_benchmark/reader/dali_tfrecord_reader.py index d66285d5..129e6d44 100644 --- a/dlio_benchmark/reader/dali_tfrecord_reader.py +++ b/dlio_benchmark/reader/dali_tfrecord_reader.py @@ -23,7 +23,7 @@ import nvidia import nvidia.dali.fn as fn from dlio_benchmark.common.constants import MODULE_DATA_READER -from dlio_benchmark.reader.dali_base_reader import DaliBaseReader +from dlio_benchmark.dlio_benchmark.reader.reader_handler import FormatReader from dlio_benchmark.utils.utility import utcnow from dlio_benchmark.common.enumerations import DatasetType, Shuffle import nvidia.dali.tfrecord as tfrec @@ -32,13 +32,13 @@ dlp = Profile(MODULE_DATA_READER) -class DaliTFRecordReader(DaliBaseReader): +class DaliTFRecordReader(FormatReader): @dlp.log_init - def __init__(self, dataset_type): - super().__init__(dataset_type) + def __init__(dataset_type, thread_index, epoch): + super().__init__(dataset_type, thread_index) @dlp.log - def _load(self): + def read(self): folder = "valid" if self.dataset_type == DatasetType.TRAIN: folder = "train" @@ -71,7 +71,20 @@ def _load(self): initial_fill=initial_fill, random_shuffle=random_shuffle, seed=seed, stick_to_shard=True, pad_last_batch=True) - return dataset["image"] + dataset = self._preprocess(dataset["image"]) + dataset = self._resize(dataset) + return dataset + + @dlp.log + def _preprocess(self, dataset): + if self._args.preprocess_time != 0. or self._args.preprocess_time_stdev != 0.: + t = np.random.normal(self._args.preprocess_time, self._args.preprocess_time_stdev) + sleep(max(t, 0.0)) + return dataset + + @dlp.log + def _resize(self, dataset): + return nvidia.dali.fn.reshape(dataset, shape=[self._args.max_dimension, self._args.max_dimension]) @dlp.log def finalize(self): diff --git a/dlio_benchmark/reader/reader_factory.py b/dlio_benchmark/reader/reader_factory.py index 68f9bbb8..c1404722 100644 --- a/dlio_benchmark/reader/reader_factory.py +++ b/dlio_benchmark/reader/reader_factory.py @@ -46,14 +46,14 @@ def get_reader(type, dataset_type, thread_index, epoch_number): elif type == FormatType.JPEG or type == FormatType.PNG: if _args.data_loader == DataLoaderType.NATIVE_DALI: from dlio_benchmark.reader.dali_image_reader import DaliImageReader - return DaliImageReader(dataset_type) + return DaliImageReader(dataset_type, thread_index, epoch_number) else: from dlio_benchmark.reader.image_reader import ImageReader return ImageReader(dataset_type, thread_index, epoch_number) elif type == FormatType.NPY: if _args.data_loader == DataLoaderType.NATIVE_DALI: from dlio_benchmark.reader.dali_npy_reader import DaliNPYReader - return DaliNPYReader(dataset_type) + return DaliNPYReader(dataset_type, thread_index, epoch_number) else: from dlio_benchmark.reader.npy_reader import NPYReader return NPYReader(dataset_type, thread_index, epoch_number) @@ -67,7 +67,7 @@ def get_reader(type, dataset_type, thread_index, epoch_number): elif type == FormatType.TFRECORD: if _args.data_loader == DataLoaderType.NATIVE_DALI: from dlio_benchmark.reader.dali_tfrecord_reader import DaliTFRecordReader - return DaliTFRecordReader(dataset_type) + return DaliTFRecordReader(dataset_type, thread_index, epoch_number) else: from dlio_benchmark.reader.tf_reader import TFReader return TFReader(dataset_type, thread_index, epoch_number) diff --git a/dlio_benchmark/reader/reader_handler.py b/dlio_benchmark/reader/reader_handler.py index 48675ab3..1b6f9189 100644 --- a/dlio_benchmark/reader/reader_handler.py +++ b/dlio_benchmark/reader/reader_handler.py @@ -124,6 +124,10 @@ def finalize(self): self.close(filename) self.open_file_map[filename] = None + @abstractmethod + def read(self): + pass + def __del__(self): self.thread_index = None self._args = None From 5dc3907e627da6ad93b196bd73a5ac701778c26d Mon Sep 17 00:00:00 2001 From: Huihuo Zheng Date: Mon, 11 Dec 2023 16:56:20 +0000 Subject: [PATCH 15/36] fixed a bug --- dlio_benchmark/reader/dali_npy_reader.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dlio_benchmark/reader/dali_npy_reader.py b/dlio_benchmark/reader/dali_npy_reader.py index f2b12730..a037a0db 100644 --- a/dlio_benchmark/reader/dali_npy_reader.py +++ b/dlio_benchmark/reader/dali_npy_reader.py @@ -29,7 +29,7 @@ dlp = Profile(MODULE_DATA_READER) -class DaliNPYReader(NativeDaliBaseReader): +class DaliNPYReader(FormatReader): @dlp.log_init def __init__(dataset_type, thread_index, epoch): super().__init__(dataset_type, thread_index) From 3fb3602d589d0b22f33013d7e22535a77a433070 Mon Sep 17 00:00:00 2001 From: Huihuo Zheng Date: Mon, 11 Dec 2023 17:04:20 +0000 Subject: [PATCH 16/36] added native-dali-loader tests in github action --- .github/workflows/python-package-conda.yml | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/.github/workflows/python-package-conda.yml b/.github/workflows/python-package-conda.yml index 1108991f..ff8d3ee0 100644 --- a/.github/workflows/python-package-conda.yml +++ b/.github/workflows/python-package-conda.yml @@ -149,6 +149,16 @@ jobs: source ${VENV}/bin/activate mpirun -np 2 dlio_benchmark workload=unet3d ++workload.framework=tensorflow ++workload.data_reader.data_loader=tensorflow ++workload.train.computation_time=0.05 ++workload.evaluation.eval_time=0.01 ++workload.train.epochs=2 ++workload.workflow.train=False ++workload.workflow.generate_data=True ++workload.dataset.num_files_train=16 ++workload.dataset.num_files_eval=16 ++workload.reader.read_threads=2 ++workload.dataset.record_length=4096 ++workload.dataset.record_length_stdev=0 mpirun -np 2 dlio_benchmark workload=unet3d ++workload.framework=tensorflow ++workload.data_reader.data_loader=tensorflow ++workload.train.computation_time=0.05 ++workload.evaluation.eval_time=0.01 ++workload.train.epochs=2 ++workload.workflow.train=True ++workload.workflow.generate_data=False ++workload.dataset.num_files_train=16 ++workload.dataset.num_files_eval=16 ++workload.reader.read_threads=2 ++workload.dataset.record_length=4096 ++workload.dataset.record_length_stdev=0 + - name: test-torch-native-dali-loader-npz + run: | + source ${VENV}/bin/activate + mpirun -np 2 dlio_benchmark workload=unet3d ++workload.reader.data_loader=native_dali ++workload.train.computation_time=0.05 ++workload.evaluation.eval_time=0.01 ++workload.train.epochs=1 ++workload.workflow.train=False ++workload.workflow.generate_data=True ++workload.dataset.num_files_train=16 ++workload.dataset.num_files_eval=16 ++workload.reader.read_threads=2 ++workload.dataset.record_length=4096 ++workload.dataset.record_length_stdev=0 + mpirun -np 2 dlio_benchmark workload=unet3d ++workload.reader.data_loader=native_dali ++workload.train.computation_time=0.05 ++workload.evaluation.eval_time=0.01 ++workload.train.epochs=1 ++workload.workflow.train=True ++workload.workflow.generate_data=False ++workload.dataset.num_files_train=16 ++workload.dataset.num_files_eval=16 ++workload.reader.read_threads=2 ++workload.dataset.record_length=4096 ++workload.dataset.record_length_stdev=0 + - name: test-tf-native-dali-loader-npz + run: | + source ${VENV}/bin/activate + mpirun -np 2 dlio_benchmark workload=unet3d ++workload.framework=tensorflow ++workload.reader.data_loader=native_dali ++workload.train.computation_time=0.05 ++workload.evaluation.eval_time=0.01 ++workload.train.epochs=1 ++workload.workflow.train=False ++workload.workflow.generate_data=True ++workload.dataset.num_files_train=16 ++workload.dataset.num_files_eval=16 ++workload.reader.read_threads=2 ++workload.dataset.record_length=4096 ++workload.dataset.record_length_stdev=0 + mpirun -np 2 dlio_benchmark workload=unet3d ++workload.framework=tensorflow ++workload.reader.data_loader=native_dali ++workload.train.computation_time=0.05 ++workload.evaluation.eval_time=0.01 ++workload.train.epochs=1 ++workload.workflow.train=True ++workload.workflow.generate_data=False ++workload.dataset.num_files_train=16 ++workload.dataset.num_files_eval=16 ++workload.reader.read_threads=2 ++workload.dataset.record_length=4096 ++workload.dataset.record_length_stdev=0 - name: test_subset run: | source ${VENV}/bin/activate From 248cfa2641cc39a1b9a11e2a2ca3de54fe96b2c7 Mon Sep 17 00:00:00 2001 From: Huihuo Zheng Date: Mon, 11 Dec 2023 17:05:53 +0000 Subject: [PATCH 17/36] corrected github action formats --- .github/workflows/python-package-conda.yml | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/.github/workflows/python-package-conda.yml b/.github/workflows/python-package-conda.yml index ff8d3ee0..f7f579d4 100644 --- a/.github/workflows/python-package-conda.yml +++ b/.github/workflows/python-package-conda.yml @@ -150,15 +150,15 @@ jobs: mpirun -np 2 dlio_benchmark workload=unet3d ++workload.framework=tensorflow ++workload.data_reader.data_loader=tensorflow ++workload.train.computation_time=0.05 ++workload.evaluation.eval_time=0.01 ++workload.train.epochs=2 ++workload.workflow.train=False ++workload.workflow.generate_data=True ++workload.dataset.num_files_train=16 ++workload.dataset.num_files_eval=16 ++workload.reader.read_threads=2 ++workload.dataset.record_length=4096 ++workload.dataset.record_length_stdev=0 mpirun -np 2 dlio_benchmark workload=unet3d ++workload.framework=tensorflow ++workload.data_reader.data_loader=tensorflow ++workload.train.computation_time=0.05 ++workload.evaluation.eval_time=0.01 ++workload.train.epochs=2 ++workload.workflow.train=True ++workload.workflow.generate_data=False ++workload.dataset.num_files_train=16 ++workload.dataset.num_files_eval=16 ++workload.reader.read_threads=2 ++workload.dataset.record_length=4096 ++workload.dataset.record_length_stdev=0 - name: test-torch-native-dali-loader-npz - run: | - source ${VENV}/bin/activate - mpirun -np 2 dlio_benchmark workload=unet3d ++workload.reader.data_loader=native_dali ++workload.train.computation_time=0.05 ++workload.evaluation.eval_time=0.01 ++workload.train.epochs=1 ++workload.workflow.train=False ++workload.workflow.generate_data=True ++workload.dataset.num_files_train=16 ++workload.dataset.num_files_eval=16 ++workload.reader.read_threads=2 ++workload.dataset.record_length=4096 ++workload.dataset.record_length_stdev=0 - mpirun -np 2 dlio_benchmark workload=unet3d ++workload.reader.data_loader=native_dali ++workload.train.computation_time=0.05 ++workload.evaluation.eval_time=0.01 ++workload.train.epochs=1 ++workload.workflow.train=True ++workload.workflow.generate_data=False ++workload.dataset.num_files_train=16 ++workload.dataset.num_files_eval=16 ++workload.reader.read_threads=2 ++workload.dataset.record_length=4096 ++workload.dataset.record_length_stdev=0 + run: | + source ${VENV}/bin/activate + mpirun -np 2 dlio_benchmark workload=unet3d ++workload.reader.data_loader=native_dali ++workload.train.computation_time=0.05 ++workload.evaluation.eval_time=0.01 ++workload.train.epochs=1 ++workload.workflow.train=False ++workload.workflow.generate_data=True ++workload.dataset.num_files_train=16 ++workload.dataset.num_files_eval=16 ++workload.reader.read_threads=2 ++workload.dataset.record_length=4096 ++workload.dataset.record_length_stdev=0 + mpirun -np 2 dlio_benchmark workload=unet3d ++workload.reader.data_loader=native_dali ++workload.train.computation_time=0.05 ++workload.evaluation.eval_time=0.01 ++workload.train.epochs=1 ++workload.workflow.train=True ++workload.workflow.generate_data=False ++workload.dataset.num_files_train=16 ++workload.dataset.num_files_eval=16 ++workload.reader.read_threads=2 ++workload.dataset.record_length=4096 ++workload.dataset.record_length_stdev=0 - name: test-tf-native-dali-loader-npz - run: | - source ${VENV}/bin/activate - mpirun -np 2 dlio_benchmark workload=unet3d ++workload.framework=tensorflow ++workload.reader.data_loader=native_dali ++workload.train.computation_time=0.05 ++workload.evaluation.eval_time=0.01 ++workload.train.epochs=1 ++workload.workflow.train=False ++workload.workflow.generate_data=True ++workload.dataset.num_files_train=16 ++workload.dataset.num_files_eval=16 ++workload.reader.read_threads=2 ++workload.dataset.record_length=4096 ++workload.dataset.record_length_stdev=0 - mpirun -np 2 dlio_benchmark workload=unet3d ++workload.framework=tensorflow ++workload.reader.data_loader=native_dali ++workload.train.computation_time=0.05 ++workload.evaluation.eval_time=0.01 ++workload.train.epochs=1 ++workload.workflow.train=True ++workload.workflow.generate_data=False ++workload.dataset.num_files_train=16 ++workload.dataset.num_files_eval=16 ++workload.reader.read_threads=2 ++workload.dataset.record_length=4096 ++workload.dataset.record_length_stdev=0 + run: | + source ${VENV}/bin/activate + mpirun -np 2 dlio_benchmark workload=unet3d ++workload.framework=tensorflow ++workload.reader.data_loader=native_dali ++workload.train.computation_time=0.05 ++workload.evaluation.eval_time=0.01 ++workload.train.epochs=1 ++workload.workflow.train=False ++workload.workflow.generate_data=True ++workload.dataset.num_files_train=16 ++workload.dataset.num_files_eval=16 ++workload.reader.read_threads=2 ++workload.dataset.record_length=4096 ++workload.dataset.record_length_stdev=0 + mpirun -np 2 dlio_benchmark workload=unet3d ++workload.framework=tensorflow ++workload.reader.data_loader=native_dali ++workload.train.computation_time=0.05 ++workload.evaluation.eval_time=0.01 ++workload.train.epochs=1 ++workload.workflow.train=True ++workload.workflow.generate_data=False ++workload.dataset.num_files_train=16 ++workload.dataset.num_files_eval=16 ++workload.reader.read_threads=2 ++workload.dataset.record_length=4096 ++workload.dataset.record_length_stdev=0 - name: test_subset run: | source ${VENV}/bin/activate From 344298d68532b5977ede1a3fa9c8e93eb7fa1307 Mon Sep 17 00:00:00 2001 From: Huihuo Zheng Date: Mon, 11 Dec 2023 17:14:37 +0000 Subject: [PATCH 18/36] fixed read return --- dlio_benchmark/reader/reader_handler.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dlio_benchmark/reader/reader_handler.py b/dlio_benchmark/reader/reader_handler.py index 1b6f9189..6370e4fb 100644 --- a/dlio_benchmark/reader/reader_handler.py +++ b/dlio_benchmark/reader/reader_handler.py @@ -126,7 +126,7 @@ def finalize(self): @abstractmethod def read(self): - pass + return def __del__(self): self.thread_index = None From ac025eb7ded7bff8e45c70dc42fb015291d70650 Mon Sep 17 00:00:00 2001 From: Huihuo Zheng Date: Mon, 11 Dec 2023 19:20:59 +0000 Subject: [PATCH 19/36] removed abstractmethod --- dlio_benchmark/reader/reader_handler.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dlio_benchmark/reader/reader_handler.py b/dlio_benchmark/reader/reader_handler.py index 6370e4fb..8084bbdd 100644 --- a/dlio_benchmark/reader/reader_handler.py +++ b/dlio_benchmark/reader/reader_handler.py @@ -124,8 +124,8 @@ def finalize(self): self.close(filename) self.open_file_map[filename] = None - @abstractmethod def read(self): + logging.error("This method is not implemented!") return def __del__(self): From d2d544f58f92afa99191835b68e89d2ce65419dd Mon Sep 17 00:00:00 2001 From: Huihuo Zheng Date: Mon, 11 Dec 2023 21:09:51 +0000 Subject: [PATCH 20/36] fixed bugs --- dlio_benchmark/reader/dali_image_reader.py | 22 +++++++++++--- dlio_benchmark/reader/dali_npy_reader.py | 29 +++++++++++++++---- dlio_benchmark/reader/dali_tfrecord_reader.py | 21 +++++++++++--- dlio_benchmark/reader/hdf5_reader.py | 4 +++ dlio_benchmark/reader/image_reader.py | 4 +++ dlio_benchmark/reader/npy_reader.py | 6 +++- dlio_benchmark/reader/npz_reader.py | 6 +++- dlio_benchmark/reader/reader_factory.py | 6 ++-- dlio_benchmark/reader/reader_handler.py | 4 +-- dlio_benchmark/reader/tf_reader.py | 8 +++-- 10 files changed, 86 insertions(+), 24 deletions(-) diff --git a/dlio_benchmark/reader/dali_image_reader.py b/dlio_benchmark/reader/dali_image_reader.py index 258e9c7d..12c2f1da 100644 --- a/dlio_benchmark/reader/dali_image_reader.py +++ b/dlio_benchmark/reader/dali_image_reader.py @@ -31,13 +31,27 @@ class DaliImageReader(FormatReader): @dlp.log_init - def __init__(dataset_type, thread_index, epoch): + def __init__(self, dataset_type, thread_index, epoch): super().__init__(dataset_type, thread_index) + def open(self): + super().open() + + def close(self): + super().close() + + def get_sample(self, filename, sample_index): + super().get_sample(filename, sample_index) + + def next(self): + super().next() + + def read_index(self): + super().read_index() @dlp.log def read(self): logging.debug( - f"{utcnow()} Reading {len(self.file_list)} files rank {self._args.my_rank}") + f"{utcnow()} Reading {len(self._file_list)} files rank {self._args.my_rank}") random_shuffle = False seed = -1 seed_change_epoch = False @@ -56,7 +70,7 @@ def read(self): stick_to_shard = True if seed_change_epoch: stick_to_shard = False - images, labels = fn.readers.file(files=self.file_list, num_shards=self._args.comm_size, + images, labels = fn.readers.file(files=self._file_list, num_shards=self._args.comm_size, prefetch_queue_depth=prefetch_size, initial_fill=initial_fill, random_shuffle=random_shuffle, shuffle_after_epoch=seed_change_epoch, @@ -75,7 +89,7 @@ def _preprocess(self, dataset): @dlp.log def _resize(self, dataset): - return nvidia.dali.fn.reshape(dataset, shape=[self._args.max_dimension, self._args.max_dimension]) + return fn.resize(dataset, size=[self._args.max_dimension, self._args.max_dimension]) @dlp.log def finalize(self): diff --git a/dlio_benchmark/reader/dali_npy_reader.py b/dlio_benchmark/reader/dali_npy_reader.py index a037a0db..c2d3bf4f 100644 --- a/dlio_benchmark/reader/dali_npy_reader.py +++ b/dlio_benchmark/reader/dali_npy_reader.py @@ -20,7 +20,7 @@ import nvidia.dali.fn as fn from dlio_benchmark.common.constants import MODULE_DATA_READER -from dlio_benchmark.dlio_benchmark.reader.reader_handler import FormatReader +from dlio_benchmark.reader.reader_handler import FormatReader from dlio_benchmark.utils.utility import utcnow from dlio_benchmark.common.enumerations import DatasetType, Shuffle import nvidia.dali.tfrecord as tfrec @@ -31,13 +31,28 @@ class DaliNPYReader(FormatReader): @dlp.log_init - def __init__(dataset_type, thread_index, epoch): + def __init__(self, dataset_type, thread_index, epoch): super().__init__(dataset_type, thread_index) + def open(self): + super().open() + + def close(self): + super().close() + + def get_sample(self, filename, sample_index): + super().get_sample(filename, sample_index) + + def next(self): + super().next() + + def read_index(self): + super().read_index() + @dlp.log def read(self): logging.debug( - f"{utcnow()} Reading {len(self.file_list)} files rank {self._args.my_rank}") + f"{utcnow()} Reading {len(self._file_list)} files rank {self._args.my_rank}") random_shuffle = False seed = -1 seed_change_epoch = False @@ -54,10 +69,13 @@ def read(self): prefetch_size = self._args.prefetch_size stick_to_shard = True + if random_shuffle == True: + seed_change_epoch = False if seed_change_epoch: stick_to_shard = False - dataset = fn.readers.numpy(device='cpu', files=self.file_list, num_shards=self._args.comm_size, + + dataset = fn.readers.numpy(device='cpu', files=self._file_list, num_shards=self._args.comm_size, prefetch_queue_depth=prefetch_size, initial_fill=initial_fill, random_shuffle=random_shuffle, seed=seed, shuffle_after_epoch=seed_change_epoch, stick_to_shard=stick_to_shard, pad_last_batch=True) @@ -76,7 +94,8 @@ def _preprocess(self, dataset): @dlp.log def _resize(self, dataset): - return nvidia.dali.fn.reshape(dataset, shape=[self._args.max_dimension, self._args.max_dimension]) + return fn.resize(dataset, size=[self._args.max_dimension, self._args.max_dimension]) + @dlp.log def finalize(self): pass diff --git a/dlio_benchmark/reader/dali_tfrecord_reader.py b/dlio_benchmark/reader/dali_tfrecord_reader.py index 129e6d44..856d8bfd 100644 --- a/dlio_benchmark/reader/dali_tfrecord_reader.py +++ b/dlio_benchmark/reader/dali_tfrecord_reader.py @@ -34,9 +34,22 @@ class DaliTFRecordReader(FormatReader): @dlp.log_init - def __init__(dataset_type, thread_index, epoch): + def __init__(self, dataset_type, thread_index, epoch): super().__init__(dataset_type, thread_index) + def open(self): + super().open() + def close(self): + super().close() + + def get_sample(self, filename, sample_index): + super().get_sample(filename, sample_index) + + def next(self): + super().next() + + def read_index(self): + super().read_index() @dlp.log def read(self): folder = "valid" @@ -44,7 +57,7 @@ def read(self): folder = "train" index_folder = f"{self._args.data_folder}/index/{folder}" index_files = [] - for file in self.file_list: + for file in self._file_list: filename = os.path.basename(file) index_files.append(f"{index_folder}/{filename}.idx") logging.info( @@ -61,7 +74,7 @@ def read(self): prefetch_size = 1 if self._args.prefetch_size > 0: prefetch_size = self._args.prefetch_size - dataset = fn.readers.tfrecord(path=self.file_list, + dataset = fn.readers.tfrecord(path=self._file_list, index_path=index_files, features={ 'image': tfrec.FixedLenFeature((), tfrec.string, ""), @@ -84,7 +97,7 @@ def _preprocess(self, dataset): @dlp.log def _resize(self, dataset): - return nvidia.dali.fn.reshape(dataset, shape=[self._args.max_dimension, self._args.max_dimension]) + return fn.resize(dataset, size=[self._args.max_dimension, self._args.max_dimension]) @dlp.log def finalize(self): diff --git a/dlio_benchmark/reader/hdf5_reader.py b/dlio_benchmark/reader/hdf5_reader.py index f95d9b94..b8801abf 100644 --- a/dlio_benchmark/reader/hdf5_reader.py +++ b/dlio_benchmark/reader/hdf5_reader.py @@ -58,6 +58,10 @@ def next(self): def read_index(self, image_idx, step): return super().read_index(image_idx, step) + @dlp.log + def read(self): + return super().read() + @dlp.log def finalize(self): return super().finalize() diff --git a/dlio_benchmark/reader/image_reader.py b/dlio_benchmark/reader/image_reader.py index 1fe63a05..2848c868 100644 --- a/dlio_benchmark/reader/image_reader.py +++ b/dlio_benchmark/reader/image_reader.py @@ -59,6 +59,10 @@ def next(self): def read_index(self, image_idx, step): return super().read_index(image_idx, step) + @dlp.log + def read(self): + return super().read() + @dlp.log def finalize(self): return super().finalize() diff --git a/dlio_benchmark/reader/npy_reader.py b/dlio_benchmark/reader/npy_reader.py index 435df6b3..931e3355 100644 --- a/dlio_benchmark/reader/npy_reader.py +++ b/dlio_benchmark/reader/npy_reader.py @@ -57,4 +57,8 @@ def read_index(self, image_idx, step): @dlp.log def finalize(self): - return super().finalize() \ No newline at end of file + return super().finalize() + + @dlp.log + def read(self): + return super().read() \ No newline at end of file diff --git a/dlio_benchmark/reader/npz_reader.py b/dlio_benchmark/reader/npz_reader.py index f0144f74..678ebd12 100644 --- a/dlio_benchmark/reader/npz_reader.py +++ b/dlio_benchmark/reader/npz_reader.py @@ -57,4 +57,8 @@ def read_index(self, image_idx, step): @dlp.log def finalize(self): - return super().finalize() \ No newline at end of file + return super().finalize() + + @dlp.log + def read(self): + return super().read() \ No newline at end of file diff --git a/dlio_benchmark/reader/reader_factory.py b/dlio_benchmark/reader/reader_factory.py index c1404722..e84db142 100644 --- a/dlio_benchmark/reader/reader_factory.py +++ b/dlio_benchmark/reader/reader_factory.py @@ -59,8 +59,7 @@ def get_reader(type, dataset_type, thread_index, epoch_number): return NPYReader(dataset_type, thread_index, epoch_number) elif type == FormatType.NPZ: if _args.data_loader == DataLoaderType.NATIVE_DALI: - print("Loading data of %s format is not supported without framework data loader" %type) - raise Exception(type) + raise Exception("Loading data of %s format is not supported without framework data loader; please use npy format instead." %type) else: from dlio_benchmark.reader.npz_reader import NPZReader return NPZReader(dataset_type, thread_index, epoch_number) @@ -72,5 +71,4 @@ def get_reader(type, dataset_type, thread_index, epoch_number): from dlio_benchmark.reader.tf_reader import TFReader return TFReader(dataset_type, thread_index, epoch_number) else: - print("Loading data of %s format is not supported without framework data loader" %type) - raise Exception(type) + raise Exception("Loading data of %s format is not supported without framework data loader" %type) \ No newline at end of file diff --git a/dlio_benchmark/reader/reader_handler.py b/dlio_benchmark/reader/reader_handler.py index 8084bbdd..93dee926 100644 --- a/dlio_benchmark/reader/reader_handler.py +++ b/dlio_benchmark/reader/reader_handler.py @@ -49,6 +49,7 @@ def __init__(self, dataset_type, thread_index): FormatReader.read_images = 0 self.step = 1 self.image_idx = 0 + self._file_list = self._args.file_list_train if self.dataset_type is DatasetType.TRAIN else self._args.file_list_eval self.batch_size = self._args.batch_size if self.dataset_type is DatasetType.TRAIN else self._args.batch_size_eval @dlp.log @@ -123,9 +124,8 @@ def finalize(self): if filename in self.open_file_map: self.close(filename) self.open_file_map[filename] = None - + @abstractmethod def read(self): - logging.error("This method is not implemented!") return def __del__(self): diff --git a/dlio_benchmark/reader/tf_reader.py b/dlio_benchmark/reader/tf_reader.py index a889076b..56b9ca58 100644 --- a/dlio_benchmark/reader/tf_reader.py +++ b/dlio_benchmark/reader/tf_reader.py @@ -37,9 +37,7 @@ class TFReader(FormatReader): def __init__(self, dataset_type, thread_index, epoch): super().__init__(dataset_type, thread_index) self._dataset = None - self._file_list = self._args.file_list_train if self.dataset_type is DatasetType.TRAIN else self._args.file_list_eval - self.batch_size = self._args.batch_size if self.dataset_type is DatasetType.TRAIN else self._args.batch_size_eval - @dlp.log + @dlp.log def open(self, filename): pass @@ -105,3 +103,7 @@ def read_index(self, image_idx, step): @dlp.log def finalize(self): return super().finalize() + + @dlp.log + def read(self): + return super().read() \ No newline at end of file From 7312c0143c42b36fb2781675367c678ae318c02c Mon Sep 17 00:00:00 2001 From: Huihuo Zheng Date: Mon, 11 Dec 2023 21:15:45 +0000 Subject: [PATCH 21/36] added dont_use_mmap --- dlio_benchmark/reader/dali_image_reader.py | 3 ++- dlio_benchmark/reader/dali_npy_reader.py | 3 ++- dlio_benchmark/reader/dali_tfrecord_reader.py | 3 ++- dlio_benchmark/utils/config.py | 3 +++ 4 files changed, 9 insertions(+), 3 deletions(-) diff --git a/dlio_benchmark/reader/dali_image_reader.py b/dlio_benchmark/reader/dali_image_reader.py index 12c2f1da..54bab5fd 100644 --- a/dlio_benchmark/reader/dali_image_reader.py +++ b/dlio_benchmark/reader/dali_image_reader.py @@ -74,7 +74,8 @@ def read(self): prefetch_queue_depth=prefetch_size, initial_fill=initial_fill, random_shuffle=random_shuffle, shuffle_after_epoch=seed_change_epoch, - stick_to_shard=stick_to_shard, pad_last_batch=True) + stick_to_shard=stick_to_shard, pad_last_batch=True, + dont_use_mmap=self._args.dont_use_mmap) dataset = fn.decoders.image(images, device='cpu') dataset = self._preprocess(images) dataset = self._resize(images) diff --git a/dlio_benchmark/reader/dali_npy_reader.py b/dlio_benchmark/reader/dali_npy_reader.py index c2d3bf4f..b7b9947a 100644 --- a/dlio_benchmark/reader/dali_npy_reader.py +++ b/dlio_benchmark/reader/dali_npy_reader.py @@ -78,7 +78,8 @@ def read(self): dataset = fn.readers.numpy(device='cpu', files=self._file_list, num_shards=self._args.comm_size, prefetch_queue_depth=prefetch_size, initial_fill=initial_fill, random_shuffle=random_shuffle, seed=seed, shuffle_after_epoch=seed_change_epoch, - stick_to_shard=stick_to_shard, pad_last_batch=True) + stick_to_shard=stick_to_shard, pad_last_batch=True, + dont_use_mmap=self._args.dont_use_mmap) dataset = self._preprocess(dataset) dataset = self._resize(dataset) diff --git a/dlio_benchmark/reader/dali_tfrecord_reader.py b/dlio_benchmark/reader/dali_tfrecord_reader.py index 856d8bfd..01b20c7d 100644 --- a/dlio_benchmark/reader/dali_tfrecord_reader.py +++ b/dlio_benchmark/reader/dali_tfrecord_reader.py @@ -83,7 +83,8 @@ def read(self): prefetch_queue_depth=prefetch_size, initial_fill=initial_fill, random_shuffle=random_shuffle, seed=seed, - stick_to_shard=True, pad_last_batch=True) + stick_to_shard=True, pad_last_batch=True, + dont_use_mmap=self._args.dont_use_mmap) dataset = self._preprocess(dataset["image"]) dataset = self._resize(dataset) return dataset diff --git a/dlio_benchmark/utils/config.py b/dlio_benchmark/utils/config.py index 05bcd400..eebf8525 100644 --- a/dlio_benchmark/utils/config.py +++ b/dlio_benchmark/utils/config.py @@ -78,6 +78,7 @@ class ConfigArguments: steps_between_checkpoints: int = -1 transfer_size: int = None read_threads: int = 1 + dont_use_mmap: bool = False computation_threads: int = 1 computation_time: float = 0. computation_time_stdev: float = 0. @@ -349,6 +350,8 @@ def LoadConfig(args, config): elif 'reader' in config: reader = config['reader'] if reader is not None: + if 'dont_use_mmap' in reader: + args.dont_use_mmap = reader['dont_use_mmap'] if 'reader_classname' in reader: args.reader_classname = reader['reader_classname'] if 'multiprocessing_context' in reader: From 02c3855b784f7e044b0e40d8089b870ac78854cc Mon Sep 17 00:00:00 2001 From: Huihuo Zheng Date: Mon, 11 Dec 2023 21:23:25 +0000 Subject: [PATCH 22/36] fixed indent --- dlio_benchmark/reader/tf_reader.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/dlio_benchmark/reader/tf_reader.py b/dlio_benchmark/reader/tf_reader.py index 56b9ca58..f40dbdcc 100644 --- a/dlio_benchmark/reader/tf_reader.py +++ b/dlio_benchmark/reader/tf_reader.py @@ -37,7 +37,8 @@ class TFReader(FormatReader): def __init__(self, dataset_type, thread_index, epoch): super().__init__(dataset_type, thread_index) self._dataset = None - @dlp.log + + @dlp.log def open(self, filename): pass From 60c508c59a38003ee2ebbfd22ffc9c5f1aa1d02d Mon Sep 17 00:00:00 2001 From: Huihuo Zheng Date: Mon, 11 Dec 2023 21:26:17 +0000 Subject: [PATCH 23/36] fixed csvreader --- dlio_benchmark/reader/csv_reader.py | 4 ++++ dlio_benchmark/reader/reader_handler.py | 1 + 2 files changed, 5 insertions(+) diff --git a/dlio_benchmark/reader/csv_reader.py b/dlio_benchmark/reader/csv_reader.py index be1e587b..cea1cd17 100644 --- a/dlio_benchmark/reader/csv_reader.py +++ b/dlio_benchmark/reader/csv_reader.py @@ -58,3 +58,7 @@ def read_index(self, image_idx, step): @dlp.log def finalize(self): return super().finalize() + + @dlp.log + def read(self): + return super().read() \ No newline at end of file diff --git a/dlio_benchmark/reader/reader_handler.py b/dlio_benchmark/reader/reader_handler.py index 93dee926..3ecd79c2 100644 --- a/dlio_benchmark/reader/reader_handler.py +++ b/dlio_benchmark/reader/reader_handler.py @@ -124,6 +124,7 @@ def finalize(self): if filename in self.open_file_map: self.close(filename) self.open_file_map[filename] = None + @abstractmethod def read(self): return From 5e96841793dfecefb4fd9f951e627b40b7c8227d Mon Sep 17 00:00:00 2001 From: Huihuo Zheng Date: Mon, 11 Dec 2023 21:41:10 +0000 Subject: [PATCH 24/36] native_dali test with npy format instead of npz --- .github/workflows/python-package-conda.yml | 12 ++--- .../data_generator/npy_generator.py | 53 +++++++++++++++++++ 2 files changed, 59 insertions(+), 6 deletions(-) create mode 100644 dlio_benchmark/data_generator/npy_generator.py diff --git a/.github/workflows/python-package-conda.yml b/.github/workflows/python-package-conda.yml index b649696c..5ac1ed5d 100644 --- a/.github/workflows/python-package-conda.yml +++ b/.github/workflows/python-package-conda.yml @@ -154,16 +154,16 @@ jobs: source ${VENV}/bin/activate mpirun -np 2 dlio_benchmark workload=unet3d ++workload.framework=tensorflow ++workload.data_reader.data_loader=tensorflow ++workload.train.computation_time=0.05 ++workload.evaluation.eval_time=0.01 ++workload.train.epochs=2 ++workload.workflow.train=False ++workload.workflow.generate_data=True ++workload.dataset.num_files_train=16 ++workload.dataset.num_files_eval=16 ++workload.reader.read_threads=2 ++workload.dataset.record_length=4096 ++workload.dataset.record_length_stdev=0 mpirun -np 2 dlio_benchmark workload=unet3d ++workload.framework=tensorflow ++workload.data_reader.data_loader=tensorflow ++workload.train.computation_time=0.05 ++workload.evaluation.eval_time=0.01 ++workload.train.epochs=2 ++workload.workflow.train=True ++workload.workflow.generate_data=False ++workload.dataset.num_files_train=16 ++workload.dataset.num_files_eval=16 ++workload.reader.read_threads=2 ++workload.dataset.record_length=4096 ++workload.dataset.record_length_stdev=0 - - name: test-torch-native-dali-loader-npz + - name: test-torch-native-dali-loader-npy run: | source ${VENV}/bin/activate - mpirun -np 2 dlio_benchmark workload=unet3d ++workload.reader.data_loader=native_dali ++workload.train.computation_time=0.05 ++workload.evaluation.eval_time=0.01 ++workload.train.epochs=1 ++workload.workflow.train=False ++workload.workflow.generate_data=True ++workload.dataset.num_files_train=16 ++workload.dataset.num_files_eval=16 ++workload.reader.read_threads=2 ++workload.dataset.record_length=4096 ++workload.dataset.record_length_stdev=0 - mpirun -np 2 dlio_benchmark workload=unet3d ++workload.reader.data_loader=native_dali ++workload.train.computation_time=0.05 ++workload.evaluation.eval_time=0.01 ++workload.train.epochs=1 ++workload.workflow.train=True ++workload.workflow.generate_data=False ++workload.dataset.num_files_train=16 ++workload.dataset.num_files_eval=16 ++workload.reader.read_threads=2 ++workload.dataset.record_length=4096 ++workload.dataset.record_length_stdev=0 - - name: test-tf-native-dali-loader-npz + mpirun -np 2 dlio_benchmark workload=unet3d ++workload.reader.data_loader=native_dali ++workload.dataset.format=npy ++workload.train.computation_time=0.05 ++workload.evaluation.eval_time=0.01 ++workload.train.epochs=1 ++workload.workflow.train=False ++workload.workflow.generate_data=True ++workload.dataset.num_files_train=16 ++workload.dataset.num_files_eval=16 ++workload.reader.read_threads=2 ++workload.dataset.record_length=4096 ++workload.dataset.record_length_stdev=0 + mpirun -np 2 dlio_benchmark workload=unet3d ++workload.reader.data_loader=native_dali ++workload.dataset.format=npy ++workload.train.computation_time=0.05 ++workload.evaluation.eval_time=0.01 ++workload.train.epochs=1 ++workload.workflow.train=True ++workload.workflow.generate_data=False ++workload.dataset.num_files_train=16 ++workload.dataset.num_files_eval=16 ++workload.reader.read_threads=2 ++workload.dataset.record_length=4096 ++workload.dataset.record_length_stdev=0 + - name: test-tf-native-dali-loader-npy run: | source ${VENV}/bin/activate - mpirun -np 2 dlio_benchmark workload=unet3d ++workload.framework=tensorflow ++workload.reader.data_loader=native_dali ++workload.train.computation_time=0.05 ++workload.evaluation.eval_time=0.01 ++workload.train.epochs=1 ++workload.workflow.train=False ++workload.workflow.generate_data=True ++workload.dataset.num_files_train=16 ++workload.dataset.num_files_eval=16 ++workload.reader.read_threads=2 ++workload.dataset.record_length=4096 ++workload.dataset.record_length_stdev=0 - mpirun -np 2 dlio_benchmark workload=unet3d ++workload.framework=tensorflow ++workload.reader.data_loader=native_dali ++workload.train.computation_time=0.05 ++workload.evaluation.eval_time=0.01 ++workload.train.epochs=1 ++workload.workflow.train=True ++workload.workflow.generate_data=False ++workload.dataset.num_files_train=16 ++workload.dataset.num_files_eval=16 ++workload.reader.read_threads=2 ++workload.dataset.record_length=4096 ++workload.dataset.record_length_stdev=0 + mpirun -np 2 dlio_benchmark workload=unet3d ++workload.framework=tensorflow ++workload.dataset.format=npy ++workload.reader.data_loader=native_dali ++workload.train.computation_time=0.05 ++workload.evaluation.eval_time=0.01 ++workload.train.epochs=1 ++workload.workflow.train=False ++workload.workflow.generate_data=True ++workload.dataset.num_files_train=16 ++workload.dataset.num_files_eval=16 ++workload.reader.read_threads=2 ++workload.dataset.record_length=4096 ++workload.dataset.record_length_stdev=0 + mpirun -np 2 dlio_benchmark workload=unet3d ++workload.framework=tensorflow ++workload.dataset.format=npy ++workload.reader.data_loader=native_dali ++workload.train.computation_time=0.05 ++workload.evaluation.eval_time=0.01 ++workload.train.epochs=1 ++workload.workflow.train=True ++workload.workflow.generate_data=False ++workload.dataset.num_files_train=16 ++workload.dataset.num_files_eval=16 ++workload.reader.read_threads=2 ++workload.dataset.record_length=4096 ++workload.dataset.record_length_stdev=0 - name: test_subset run: | source ${VENV}/bin/activate diff --git a/dlio_benchmark/data_generator/npy_generator.py b/dlio_benchmark/data_generator/npy_generator.py new file mode 100644 index 00000000..d60ec2f3 --- /dev/null +++ b/dlio_benchmark/data_generator/npy_generator.py @@ -0,0 +1,53 @@ +""" + Copyright (c) 2022, UChicago Argonne, LLC + All Rights Reserved + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +""" + +from dlio_benchmark.common.enumerations import Compression +from dlio_benchmark.data_generator.data_generator import DataGenerator + +import logging +import numpy as np + +from dlio_benchmark.utils.utility import progress, utcnow +from dlio_profiler.logger import fn_interceptor as Profile +from shutil import copyfile +from dlio_benchmark.common.constants import MODULE_DATA_GENERATOR + +dlp = Profile(MODULE_DATA_GENERATOR) + +""" +Generator for creating data in NPZ format. +""" +class NPYGenerator(DataGenerator): + def __init__(self): + super().__init__() + + @dlp.log + def generate(self): + """ + Generator for creating data in NPY format of 3d dataset. + """ + super().generate() + np.random.seed(10) + record_labels = [0] * self.num_samples + for i in dlp.iter(range(self.my_rank, int(self.total_files_to_generate), self.comm_size)): + dim1, dim2 = self.get_dimension() + records = np.random.randint(255, size=(dim1, dim2, self.num_samples), dtype=np.uint8) + out_path_spec = self.storage.get_uri(self._file_list[i]) + progress(i+1, self.total_files_to_generate, "Generating NPY Data") + prev_out_spec = out_path_spec + np.save(out_path_spec, records) + np.random.seed() From b1412e306da96895233e7f8b03de1a5b14eb34c9 Mon Sep 17 00:00:00 2001 From: Huihuo Zheng Date: Mon, 11 Dec 2023 21:53:23 +0000 Subject: [PATCH 25/36] fixed issue of enum --- dlio_benchmark/common/enumerations.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dlio_benchmark/common/enumerations.py b/dlio_benchmark/common/enumerations.py index 60ba5f63..9fab8f23 100644 --- a/dlio_benchmark/common/enumerations.py +++ b/dlio_benchmark/common/enumerations.py @@ -101,7 +101,7 @@ class FormatType(Enum): def __str__(self): return self.value - @ staticmethod + @staticmethod def get_enum(value): if FormatType.TFRECORD.value == value: return FormatType.TFRECORD From d659e5fcafdfa536875e9af2cc77b8d0c9028c53 Mon Sep 17 00:00:00 2001 From: Huihuo Zheng Date: Mon, 11 Dec 2023 21:57:56 +0000 Subject: [PATCH 26/36] modify action so that dlio will always be installed --- .github/workflows/python-package-conda.yml | 1 - 1 file changed, 1 deletion(-) diff --git a/.github/workflows/python-package-conda.yml b/.github/workflows/python-package-conda.yml index 5ac1ed5d..6f584b8f 100644 --- a/.github/workflows/python-package-conda.yml +++ b/.github/workflows/python-package-conda.yml @@ -50,7 +50,6 @@ jobs: sudo apt-get install $CC $CXX libc6 sudo apt-get install mpich libhwloc-dev - name: Install DLIO - if: steps.cache-modules.outputs.cache-hit != 'true' run: | echo "Profiler ${DLIO_PROFILER} gcc $CC" python -m pip install --upgrade pip From 96fa9c3cff539e673a705c42a8fff7dc54f160fb Mon Sep 17 00:00:00 2001 From: Huihuo Zheng Date: Mon, 11 Dec 2023 23:32:19 +0000 Subject: [PATCH 27/36] [skip ci] added documentation for dali --- README.md | 2 +- docs/source/config.rst | 4 +++- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index 3ae53fbb..5990ff57 100644 --- a/README.md +++ b/README.md @@ -23,7 +23,7 @@ dlio_benchmark ++workload.workflow.generate_data=True git clone https://github.com/argonne-lcf/dlio_benchmark cd dlio_benchmark/ pip install .[dlio_profiler] - +``` ## Container ```bash diff --git a/docs/source/config.rst b/docs/source/config.rst index 920b6590..9d245ca2 100644 --- a/docs/source/config.rst +++ b/docs/source/config.rst @@ -189,7 +189,7 @@ reader - Description * - data_loader - tensorflow - - select the data loader to use [tensorflow|pytorch|dali]. + - select the data loader to use [tensorflow|pytorch|dali|native_dali]. * - batch_size - 1 - batch size for training @@ -227,6 +227,8 @@ reader For pytorch, ``prefetch_size`` is set to be 0, it will be changed to 2. In other words, the default value for ``prefetch_size`` in pytorch is 2. + For Dali data loader, we support two options, ``dali`` and ``native_dali```. ``dali`` uses our internal reader, such as ``jpeg_reader``, ``hdf5_reader``, etc, and ``dali.fn.external_source``; whereas ``native_dali`` directly uses Dali readers, such as ``dn.readers.numpy``, ``fn.readers.tfrecord``, and ``fn.readers.file``. + train ------------------ From f9aaac28a93e9984261c93f5c0bf498dd3489fd6 Mon Sep 17 00:00:00 2001 From: Huihuo Zheng Date: Tue, 12 Dec 2023 16:19:38 +0000 Subject: [PATCH 28/36] removed read; and define it as pipeline --- .../data_loader/native_dali_data_loader.py | 6 +-- dlio_benchmark/reader/csv_reader.py | 6 +-- dlio_benchmark/reader/dali_image_reader.py | 21 +++++---- dlio_benchmark/reader/dali_npy_reader.py | 44 +++++++++---------- dlio_benchmark/reader/dali_tfrecord_reader.py | 33 +++++++------- dlio_benchmark/reader/hdf5_reader.py | 13 ++---- dlio_benchmark/reader/image_reader.py | 4 -- dlio_benchmark/reader/npy_reader.py | 6 +-- dlio_benchmark/reader/npz_reader.py | 6 +-- dlio_benchmark/reader/reader_handler.py | 6 +-- dlio_benchmark/reader/tf_reader.py | 10 ++--- 11 files changed, 60 insertions(+), 95 deletions(-) diff --git a/dlio_benchmark/data_loader/native_dali_data_loader.py b/dlio_benchmark/data_loader/native_dali_data_loader.py index 8799800b..900c8c6d 100644 --- a/dlio_benchmark/data_loader/native_dali_data_loader.py +++ b/dlio_benchmark/data_loader/native_dali_data_loader.py @@ -38,9 +38,9 @@ def read(self): exec_async=False, exec_pipelined=False) with pipeline: images = ReaderFactory.get_reader(type=self.format_type, - dataset_type=self.dataset_type, - thread_index=-1, - epoch_number=self.epoch_number).read() + dataset_type=self.dataset_type, + thread_index=-1, + epoch_number=self.epoch_number).pipeline() pipeline.set_outputs(images) self.pipelines.append(pipeline) diff --git a/dlio_benchmark/reader/csv_reader.py b/dlio_benchmark/reader/csv_reader.py index cea1cd17..0690fc69 100644 --- a/dlio_benchmark/reader/csv_reader.py +++ b/dlio_benchmark/reader/csv_reader.py @@ -57,8 +57,4 @@ def read_index(self, image_idx, step): @dlp.log def finalize(self): - return super().finalize() - - @dlp.log - def read(self): - return super().read() \ No newline at end of file + return super().finalize() \ No newline at end of file diff --git a/dlio_benchmark/reader/dali_image_reader.py b/dlio_benchmark/reader/dali_image_reader.py index 54bab5fd..02f0756b 100644 --- a/dlio_benchmark/reader/dali_image_reader.py +++ b/dlio_benchmark/reader/dali_image_reader.py @@ -16,7 +16,8 @@ """ import math import logging -from time import time +from time import time, sleep +import numpy as np import nvidia.dali.fn as fn from dlio_benchmark.common.constants import MODULE_DATA_READER @@ -33,9 +34,11 @@ class DaliImageReader(FormatReader): @dlp.log_init def __init__(self, dataset_type, thread_index, epoch): super().__init__(dataset_type, thread_index) - def open(self): - super().open() + @dlp.log + def open(self, filename): + super().open(filename) + def close(self): super().close() @@ -49,7 +52,7 @@ def read_index(self): super().read_index() @dlp.log - def read(self): + def pipeline(self): logging.debug( f"{utcnow()} Reading {len(self._file_list)} files rank {self._args.my_rank}") random_shuffle = False @@ -76,17 +79,13 @@ def read(self): shuffle_after_epoch=seed_change_epoch, stick_to_shard=stick_to_shard, pad_last_batch=True, dont_use_mmap=self._args.dont_use_mmap) - dataset = fn.decoders.image(images, device='cpu') - dataset = self._preprocess(images) + images = fn.decoders.image(images, device='cpu') dataset = self._resize(images) return dataset @dlp.log - def _preprocess(self, dataset): - if self._args.preprocess_time != 0. or self._args.preprocess_time_stdev != 0.: - t = np.random.normal(self._args.preprocess_time, self._args.preprocess_time_stdev) - sleep(max(t, 0.0)) - return dataset + def preprocess(self): + raise Exception("Emulated preprocessing method is not implemented in dali readers") @dlp.log def _resize(self, dataset): diff --git a/dlio_benchmark/reader/dali_npy_reader.py b/dlio_benchmark/reader/dali_npy_reader.py index b7b9947a..00ceada6 100644 --- a/dlio_benchmark/reader/dali_npy_reader.py +++ b/dlio_benchmark/reader/dali_npy_reader.py @@ -16,7 +16,8 @@ """ import math import logging -from time import time +from time import time, sleep +import numpy as np import nvidia.dali.fn as fn from dlio_benchmark.common.constants import MODULE_DATA_READER @@ -34,23 +35,12 @@ class DaliNPYReader(FormatReader): def __init__(self, dataset_type, thread_index, epoch): super().__init__(dataset_type, thread_index) - def open(self): - super().open() - - def close(self): - super().close() - - def get_sample(self, filename, sample_index): - super().get_sample(filename, sample_index) - - def next(self): - super().next() - - def read_index(self): - super().read_index() + @dlp.log + def open(self, filename): + super().open(filename) @dlp.log - def read(self): + def pipeline(self): logging.debug( f"{utcnow()} Reading {len(self._file_list)} files rank {self._args.my_rank}") random_shuffle = False @@ -74,24 +64,30 @@ def read(self): if seed_change_epoch: stick_to_shard = False - dataset = fn.readers.numpy(device='cpu', files=self._file_list, num_shards=self._args.comm_size, prefetch_queue_depth=prefetch_size, initial_fill=initial_fill, random_shuffle=random_shuffle, seed=seed, shuffle_after_epoch=seed_change_epoch, stick_to_shard=stick_to_shard, pad_last_batch=True, dont_use_mmap=self._args.dont_use_mmap) - - dataset = self._preprocess(dataset) dataset = self._resize(dataset) return dataset + + def close(self): + super().close() + def get_sample(self, filename, sample_index): + super().get_sample(filename, sample_index) + + def next(self): + super().next() + + def read_index(self): + super().read_index() @dlp.log - def _preprocess(self, dataset): - if self._args.preprocess_time != 0. or self._args.preprocess_time_stdev != 0.: - t = np.random.normal(self._args.preprocess_time, self._args.preprocess_time_stdev) - sleep(max(t, 0.0)) - return dataset + def preprocess(self, dataset): + raise Exception("Emulated preprocessing method is not implemented in dali readers") + @dlp.log def _resize(self, dataset): diff --git a/dlio_benchmark/reader/dali_tfrecord_reader.py b/dlio_benchmark/reader/dali_tfrecord_reader.py index 01b20c7d..e7de0d32 100644 --- a/dlio_benchmark/reader/dali_tfrecord_reader.py +++ b/dlio_benchmark/reader/dali_tfrecord_reader.py @@ -18,7 +18,8 @@ import math import logging -from time import time +from time import time, sleep +import numpy as np import nvidia import nvidia.dali.fn as fn @@ -33,25 +34,25 @@ class DaliTFRecordReader(FormatReader): + """ + Reader for NPZ files + """ @dlp.log_init def __init__(self, dataset_type, thread_index, epoch): super().__init__(dataset_type, thread_index) - def open(self): - super().open() + @dlp.log + def open(self, filename): + super().open(filename) + def close(self): super().close() def get_sample(self, filename, sample_index): super().get_sample(filename, sample_index) - def next(self): - super().next() - - def read_index(self): - super().read_index() @dlp.log - def read(self): + def pipeline(self): folder = "valid" if self.dataset_type == DatasetType.TRAIN: folder = "train" @@ -85,16 +86,16 @@ def read(self): random_shuffle=random_shuffle, seed=seed, stick_to_shard=True, pad_last_batch=True, dont_use_mmap=self._args.dont_use_mmap) - dataset = self._preprocess(dataset["image"]) - dataset = self._resize(dataset) + dataset = self._resize(dataset['image']) return dataset + def read_index(self): + super().read_index() + raise Exception("read_index method is not implemented") + @dlp.log - def _preprocess(self, dataset): - if self._args.preprocess_time != 0. or self._args.preprocess_time_stdev != 0.: - t = np.random.normal(self._args.preprocess_time, self._args.preprocess_time_stdev) - sleep(max(t, 0.0)) - return dataset + def preprocess(self, dataset): + raise Exception("Emulated preprocessing method is not implemented in dali readers") @dlp.log def _resize(self, dataset): diff --git a/dlio_benchmark/reader/hdf5_reader.py b/dlio_benchmark/reader/hdf5_reader.py index b8801abf..77dd7301 100644 --- a/dlio_benchmark/reader/hdf5_reader.py +++ b/dlio_benchmark/reader/hdf5_reader.py @@ -24,13 +24,10 @@ dlp = Profile(MODULE_DATA_READER) -""" -Reader for HDF5 files for training file. -""" - - class HDF5Reader(FormatReader): - + """ + Reader for HDF5 files. + """ @dlp.log_init def __init__(self, dataset_type, thread_index, epoch): super().__init__(dataset_type, thread_index) @@ -58,10 +55,6 @@ def next(self): def read_index(self, image_idx, step): return super().read_index(image_idx, step) - @dlp.log - def read(self): - return super().read() - @dlp.log def finalize(self): return super().finalize() diff --git a/dlio_benchmark/reader/image_reader.py b/dlio_benchmark/reader/image_reader.py index 2848c868..1fe63a05 100644 --- a/dlio_benchmark/reader/image_reader.py +++ b/dlio_benchmark/reader/image_reader.py @@ -59,10 +59,6 @@ def next(self): def read_index(self, image_idx, step): return super().read_index(image_idx, step) - @dlp.log - def read(self): - return super().read() - @dlp.log def finalize(self): return super().finalize() diff --git a/dlio_benchmark/reader/npy_reader.py b/dlio_benchmark/reader/npy_reader.py index 931e3355..d3cc46f1 100644 --- a/dlio_benchmark/reader/npy_reader.py +++ b/dlio_benchmark/reader/npy_reader.py @@ -25,7 +25,7 @@ class NPYReader(FormatReader): """ - Reader for NPZ files + Reader for NPY files """ @dlp.log_init @@ -58,7 +58,3 @@ def read_index(self, image_idx, step): @dlp.log def finalize(self): return super().finalize() - - @dlp.log - def read(self): - return super().read() \ No newline at end of file diff --git a/dlio_benchmark/reader/npz_reader.py b/dlio_benchmark/reader/npz_reader.py index 678ebd12..f0144f74 100644 --- a/dlio_benchmark/reader/npz_reader.py +++ b/dlio_benchmark/reader/npz_reader.py @@ -57,8 +57,4 @@ def read_index(self, image_idx, step): @dlp.log def finalize(self): - return super().finalize() - - @dlp.log - def read(self): - return super().read() \ No newline at end of file + return super().finalize() \ No newline at end of file diff --git a/dlio_benchmark/reader/reader_handler.py b/dlio_benchmark/reader/reader_handler.py index 3ecd79c2..0c1c41ae 100644 --- a/dlio_benchmark/reader/reader_handler.py +++ b/dlio_benchmark/reader/reader_handler.py @@ -60,7 +60,7 @@ def preprocess(self): @abstractmethod def open(self, filename): - pass + return @abstractmethod def close(self, filename): @@ -125,10 +125,6 @@ def finalize(self): self.close(filename) self.open_file_map[filename] = None - @abstractmethod - def read(self): - return - def __del__(self): self.thread_index = None self._args = None diff --git a/dlio_benchmark/reader/tf_reader.py b/dlio_benchmark/reader/tf_reader.py index f40dbdcc..d19ea96a 100644 --- a/dlio_benchmark/reader/tf_reader.py +++ b/dlio_benchmark/reader/tf_reader.py @@ -55,7 +55,7 @@ def resize_sample(self, filename, sample_index): pass @dlp.log - def parse_image(self, serialized): + def _parse_image(self, serialized): """ performs deserialization of the tfrecord. :param serialized: is the serialized version using protobuf @@ -85,7 +85,7 @@ def next(self): self._dataset = tf.data.TFRecordDataset(filenames=self._file_list, buffer_size=self._args.transfer_size) self._dataset = self._dataset.shard(num_shards=self._args.comm_size, index=self._args.my_rank) self._dataset = self._dataset.map( - lambda x: tf.py_function(func=self.parse_image, inp=[x], Tout=[tf.uint8]) + lambda x: tf.py_function(func=self._parse_image, inp=[x], Tout=[tf.uint8]) , num_parallel_calls=self._args.computation_threads) self._dataset = self._dataset.batch(self.batch_size, drop_remainder=True) total = math.ceil(len(self._file_list)/self._args.comm_size / self.batch_size * self._args.num_samples_per_file) @@ -103,8 +103,4 @@ def read_index(self, image_idx, step): @dlp.log def finalize(self): - return super().finalize() - - @dlp.log - def read(self): - return super().read() \ No newline at end of file + return super().finalize() \ No newline at end of file From ca760fe175a46731d39c7645e6e4995b64ec925d Mon Sep 17 00:00:00 2001 From: Huihuo Zheng Date: Tue, 12 Dec 2023 16:22:25 +0000 Subject: [PATCH 29/36] added exceptions for unimplemented methods --- dlio_benchmark/reader/dali_image_reader.py | 3 +++ dlio_benchmark/reader/dali_npy_reader.py | 5 ++++- dlio_benchmark/reader/dali_tfrecord_reader.py | 13 +++++++++---- 3 files changed, 16 insertions(+), 5 deletions(-) diff --git a/dlio_benchmark/reader/dali_image_reader.py b/dlio_benchmark/reader/dali_image_reader.py index 02f0756b..34c9bf7b 100644 --- a/dlio_benchmark/reader/dali_image_reader.py +++ b/dlio_benchmark/reader/dali_image_reader.py @@ -44,12 +44,15 @@ def close(self): def get_sample(self, filename, sample_index): super().get_sample(filename, sample_index) + raise Exception("get sample method is not implemented in dali readers") def next(self): super().next() + raise Exception("next method is not implemented in dali readers") def read_index(self): super().read_index() + raise Exception("read_index method is not implemented in dali readers") @dlp.log def pipeline(self): diff --git a/dlio_benchmark/reader/dali_npy_reader.py b/dlio_benchmark/reader/dali_npy_reader.py index 00ceada6..b69ea602 100644 --- a/dlio_benchmark/reader/dali_npy_reader.py +++ b/dlio_benchmark/reader/dali_npy_reader.py @@ -77,12 +77,15 @@ def close(self): def get_sample(self, filename, sample_index): super().get_sample(filename, sample_index) + raise Exception("get sample method is not implemented in dali readers") def next(self): super().next() + raise Exception("next method is not implemented in dali readers") def read_index(self): - super().read_index() + super().read_index() + raise Exception("read_index method is not implemented in dali readers") @dlp.log def preprocess(self, dataset): diff --git a/dlio_benchmark/reader/dali_tfrecord_reader.py b/dlio_benchmark/reader/dali_tfrecord_reader.py index e7de0d32..dfb3c23e 100644 --- a/dlio_benchmark/reader/dali_tfrecord_reader.py +++ b/dlio_benchmark/reader/dali_tfrecord_reader.py @@ -47,9 +47,6 @@ def open(self, filename): def close(self): super().close() - - def get_sample(self, filename, sample_index): - super().get_sample(filename, sample_index) @dlp.log def pipeline(self): @@ -89,9 +86,17 @@ def pipeline(self): dataset = self._resize(dataset['image']) return dataset + def get_sample(self, filename, sample_index): + super().get_sample(filename, sample_index) + raise Exception("get sample method is not implemented in dali readers") + + def next(self): + super().next() + raise Exception("next method is not implemented in dali readers") + def read_index(self): super().read_index() - raise Exception("read_index method is not implemented") + raise Exception("read_index method is not implemented in dali readers") @dlp.log def preprocess(self, dataset): From a8ba464968c626836e67f134b2e608c6a75c54f7 Mon Sep 17 00:00:00 2001 From: Huihuo Zheng Date: Thu, 14 Dec 2023 00:09:40 +0000 Subject: [PATCH 30/36] added preprocessing --- dlio_benchmark/reader/dali_image_reader.py | 5 +---- dlio_benchmark/reader/dali_npy_reader.py | 8 ++------ dlio_benchmark/reader/dali_tfrecord_reader.py | 5 +---- dlio_benchmark/reader/reader_handler.py | 2 +- 4 files changed, 5 insertions(+), 15 deletions(-) diff --git a/dlio_benchmark/reader/dali_image_reader.py b/dlio_benchmark/reader/dali_image_reader.py index 34c9bf7b..032885cd 100644 --- a/dlio_benchmark/reader/dali_image_reader.py +++ b/dlio_benchmark/reader/dali_image_reader.py @@ -83,13 +83,10 @@ def pipeline(self): stick_to_shard=stick_to_shard, pad_last_batch=True, dont_use_mmap=self._args.dont_use_mmap) images = fn.decoders.image(images, device='cpu') + fn.python_function(dataset, function=self.preprocess, num_outputs=0) dataset = self._resize(images) return dataset - @dlp.log - def preprocess(self): - raise Exception("Emulated preprocessing method is not implemented in dali readers") - @dlp.log def _resize(self, dataset): return fn.resize(dataset, size=[self._args.max_dimension, self._args.max_dimension]) diff --git a/dlio_benchmark/reader/dali_npy_reader.py b/dlio_benchmark/reader/dali_npy_reader.py index b69ea602..e68f4fb2 100644 --- a/dlio_benchmark/reader/dali_npy_reader.py +++ b/dlio_benchmark/reader/dali_npy_reader.py @@ -70,6 +70,7 @@ def pipeline(self): stick_to_shard=stick_to_shard, pad_last_batch=True, dont_use_mmap=self._args.dont_use_mmap) dataset = self._resize(dataset) + fn.python_function(dataset, function=self.preprocess, num_outputs=0) return dataset def close(self): @@ -86,12 +87,7 @@ def next(self): def read_index(self): super().read_index() raise Exception("read_index method is not implemented in dali readers") - - @dlp.log - def preprocess(self, dataset): - raise Exception("Emulated preprocessing method is not implemented in dali readers") - - + @dlp.log def _resize(self, dataset): return fn.resize(dataset, size=[self._args.max_dimension, self._args.max_dimension]) diff --git a/dlio_benchmark/reader/dali_tfrecord_reader.py b/dlio_benchmark/reader/dali_tfrecord_reader.py index dfb3c23e..0a33f327 100644 --- a/dlio_benchmark/reader/dali_tfrecord_reader.py +++ b/dlio_benchmark/reader/dali_tfrecord_reader.py @@ -84,6 +84,7 @@ def pipeline(self): stick_to_shard=True, pad_last_batch=True, dont_use_mmap=self._args.dont_use_mmap) dataset = self._resize(dataset['image']) + fn.python_function(dataset, function=self.preprocess, num_outputs=0) return dataset def get_sample(self, filename, sample_index): @@ -98,10 +99,6 @@ def read_index(self): super().read_index() raise Exception("read_index method is not implemented in dali readers") - @dlp.log - def preprocess(self, dataset): - raise Exception("Emulated preprocessing method is not implemented in dali readers") - @dlp.log def _resize(self, dataset): return fn.resize(dataset, size=[self._args.max_dimension, self._args.max_dimension]) diff --git a/dlio_benchmark/reader/reader_handler.py b/dlio_benchmark/reader/reader_handler.py index 0c1c41ae..57884dd5 100644 --- a/dlio_benchmark/reader/reader_handler.py +++ b/dlio_benchmark/reader/reader_handler.py @@ -53,7 +53,7 @@ def __init__(self, dataset_type, thread_index): self.batch_size = self._args.batch_size if self.dataset_type is DatasetType.TRAIN else self._args.batch_size_eval @dlp.log - def preprocess(self): + def preprocess(self, a=None): if self._args.preprocess_time != 0. or self._args.preprocess_time_stdev != 0.: t = np.random.normal(self._args.preprocess_time, self._args.preprocess_time_stdev) sleep(max(t, 0.0)) From 1f0315968369e63aa29cb6e76ac99a3c1bd765a3 Mon Sep 17 00:00:00 2001 From: Huihuo Zheng Date: Thu, 14 Dec 2023 15:54:02 +0000 Subject: [PATCH 31/36] conditional cache for DLIO installation --- .github/workflows/python-package-conda.yml | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/.github/workflows/python-package-conda.yml b/.github/workflows/python-package-conda.yml index 6f584b8f..8c53dd0f 100644 --- a/.github/workflows/python-package-conda.yml +++ b/.github/workflows/python-package-conda.yml @@ -50,6 +50,7 @@ jobs: sudo apt-get install $CC $CXX libc6 sudo apt-get install mpich libhwloc-dev - name: Install DLIO + if: steps.cache-modules.outputs.cache-hit != 'true' run: | echo "Profiler ${DLIO_PROFILER} gcc $CC" python -m pip install --upgrade pip @@ -57,7 +58,12 @@ jobs: python -m venv ${VENV} source ${VENV}/bin/activate pip install .[test] - rm -rf dlio_benchmark + if: steps.cache-modules.outputs.cache-hit == 'true' + run: | + source ${VENV}/bin/activate + rm -rf *.egg* + rm -rf build + python setup.py install - name: Install DLIO Profiler run: | echo "Profiler ${DLIO_PROFILER} gcc $CC" From 5dd6ebfbda09b09dd8d31f3184574968021377f4 Mon Sep 17 00:00:00 2001 From: Huihuo Zheng Date: Thu, 14 Dec 2023 15:57:26 +0000 Subject: [PATCH 32/36] fixed bugs --- .github/workflows/python-package-conda.yml | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) diff --git a/.github/workflows/python-package-conda.yml b/.github/workflows/python-package-conda.yml index 8c53dd0f..095573e7 100644 --- a/.github/workflows/python-package-conda.yml +++ b/.github/workflows/python-package-conda.yml @@ -49,6 +49,13 @@ jobs: sudo apt update sudo apt-get install $CC $CXX libc6 sudo apt-get install mpich libhwloc-dev + - name: Install DLIO code only + if: steps.cache-modules.outputs.cache-hit == 'true' + run: | + source ${VENV}/bin/activate + rm -rf *.egg* + rm -rf build + python setup.py install - name: Install DLIO if: steps.cache-modules.outputs.cache-hit != 'true' run: | @@ -57,13 +64,7 @@ jobs: pip install virtualenv python -m venv ${VENV} source ${VENV}/bin/activate - pip install .[test] - if: steps.cache-modules.outputs.cache-hit == 'true' - run: | - source ${VENV}/bin/activate - rm -rf *.egg* - rm -rf build - python setup.py install + pip install .[test] - name: Install DLIO Profiler run: | echo "Profiler ${DLIO_PROFILER} gcc $CC" From 6af315b29fefd8883c45b5ff518c241aa852b700 Mon Sep 17 00:00:00 2001 From: Huihuo Zheng Date: Thu, 14 Dec 2023 16:06:23 +0000 Subject: [PATCH 33/36] fixed bugs --- .github/workflows/python-package-conda.yml | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/.github/workflows/python-package-conda.yml b/.github/workflows/python-package-conda.yml index 095573e7..d8314b73 100644 --- a/.github/workflows/python-package-conda.yml +++ b/.github/workflows/python-package-conda.yml @@ -47,14 +47,16 @@ jobs: - name: Install System Tools run: | sudo apt update - sudo apt-get install $CC $CXX libc6 + sudo apt-get install $CC $CXX libc6 git sudo apt-get install mpich libhwloc-dev - name: Install DLIO code only if: steps.cache-modules.outputs.cache-hit == 'true' run: | + git pull source ${VENV}/bin/activate rm -rf *.egg* rm -rf build + python setup.py build python setup.py install - name: Install DLIO if: steps.cache-modules.outputs.cache-hit != 'true' From 8dc2b71ed8956ba7768d91f10dc1c1d3d57de225 Mon Sep 17 00:00:00 2001 From: Huihuo Zheng Date: Thu, 14 Dec 2023 16:08:28 +0000 Subject: [PATCH 34/36] fixed bugs --- .github/workflows/python-package-conda.yml | 1 - 1 file changed, 1 deletion(-) diff --git a/.github/workflows/python-package-conda.yml b/.github/workflows/python-package-conda.yml index d8314b73..2c4dd4a0 100644 --- a/.github/workflows/python-package-conda.yml +++ b/.github/workflows/python-package-conda.yml @@ -52,7 +52,6 @@ jobs: - name: Install DLIO code only if: steps.cache-modules.outputs.cache-hit == 'true' run: | - git pull source ${VENV}/bin/activate rm -rf *.egg* rm -rf build From 7e72ed5cdbc51496cc9e90aa9d8a118e5db66f56 Mon Sep 17 00:00:00 2001 From: Huihuo Zheng Date: Thu, 14 Dec 2023 16:24:52 +0000 Subject: [PATCH 35/36] fixing again --- .github/workflows/python-package-conda.yml | 2 ++ 1 file changed, 2 insertions(+) diff --git a/.github/workflows/python-package-conda.yml b/.github/workflows/python-package-conda.yml index 2c4dd4a0..7b78713d 100644 --- a/.github/workflows/python-package-conda.yml +++ b/.github/workflows/python-package-conda.yml @@ -55,6 +55,8 @@ jobs: source ${VENV}/bin/activate rm -rf *.egg* rm -rf build + rm -rf dist + pip uninstall dlio_benchmark python setup.py build python setup.py install - name: Install DLIO From 4e727a85c0b5f1cb86efb2a50ef36322f41e885c Mon Sep 17 00:00:00 2001 From: Huihuo Zheng Date: Thu, 14 Dec 2023 19:52:44 +0000 Subject: [PATCH 36/36] tests again --- .github/workflows/python-package-conda.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/python-package-conda.yml b/.github/workflows/python-package-conda.yml index 7b78713d..40dd2091 100644 --- a/.github/workflows/python-package-conda.yml +++ b/.github/workflows/python-package-conda.yml @@ -56,7 +56,7 @@ jobs: rm -rf *.egg* rm -rf build rm -rf dist - pip uninstall dlio_benchmark + pip uninstall -y dlio_benchmark python setup.py build python setup.py install - name: Install DLIO