From 00a253ddd82c916cbc8b1b7ab7c1468d20bd2ded Mon Sep 17 00:00:00 2001
From: Huihuo Zheng <zhenghh04@gmail.com>
Date: Wed, 22 Nov 2023 04:18:22 +0000
Subject: [PATCH 01/36] fixed readthedoc build issue

---
 .readthedocs.yaml | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/.readthedocs.yaml b/.readthedocs.yaml
index 5fc242ca..092a6b2b 100644
--- a/.readthedocs.yaml
+++ b/.readthedocs.yaml
@@ -30,6 +30,6 @@ sphinx:
 # Optional but recommended, declare the Python requirements required
 # to build your documentation
 # See https://docs.readthedocs.io/en/stable/guides/reproducible-builds.html
-# python:
-#    install:
-#    - requirements: docs/requirements.txt
\ No newline at end of file
+python:
+    install:
+    - requirements: docs/requirements.txt
\ No newline at end of file

From 256cecfe1f0feeb0823484a0afede5a87fe07264 Mon Sep 17 00:00:00 2001
From: Huihuo Zheng <zhenghh04@gmail.com>
Date: Fri, 1 Dec 2023 20:24:56 +0000
Subject: [PATCH 02/36] partial merged the following PR:
 https://github.com/argonne-lcf/dlio_benchmark/pull/81

---
 dlio_benchmark/common/enumerations.py         |  1 +
 .../data_loader/data_loader_factory.py        |  3 +
 .../data_loader/native_dali_data_loader.py    | 60 ++++++++++++++
 dlio_benchmark/reader/dali_image_reader.py    | 69 ++++++++++++++++
 dlio_benchmark/reader/dali_npz_reader.py      | 68 ++++++++++++++++
 dlio_benchmark/reader/dali_tfrecord_reader.py | 78 +++++++++++++++++++
 .../reader/{png_reader.py => image_reader.py} |  4 +-
 dlio_benchmark/reader/jpeg_reader.py          | 62 ---------------
 dlio_benchmark/reader/npz_reader.py           | 60 --------------
 dlio_benchmark/reader/reader_factory.py       | 26 ++++---
 10 files changed, 297 insertions(+), 134 deletions(-)
 create mode 100644 dlio_benchmark/data_loader/native_dali_data_loader.py
 create mode 100644 dlio_benchmark/reader/dali_image_reader.py
 create mode 100644 dlio_benchmark/reader/dali_npz_reader.py
 create mode 100644 dlio_benchmark/reader/dali_tfrecord_reader.py
 rename dlio_benchmark/reader/{png_reader.py => image_reader.py} (96%)
 delete mode 100644 dlio_benchmark/reader/jpeg_reader.py
 delete mode 100644 dlio_benchmark/reader/npz_reader.py

diff --git a/dlio_benchmark/common/enumerations.py b/dlio_benchmark/common/enumerations.py
index 64227772..0081195e 100644
--- a/dlio_benchmark/common/enumerations.py
+++ b/dlio_benchmark/common/enumerations.py
@@ -124,6 +124,7 @@ class DataLoaderType(Enum):
     TENSORFLOW='tensorflow'
     PYTORCH='pytorch'
     DALI='dali'
+    NATIVE_DALI = 'native_dali'
     CUSTOM='custom'
     NONE='none'
     
diff --git a/dlio_benchmark/data_loader/data_loader_factory.py b/dlio_benchmark/data_loader/data_loader_factory.py
index e8457450..13bf16b0 100644
--- a/dlio_benchmark/data_loader/data_loader_factory.py
+++ b/dlio_benchmark/data_loader/data_loader_factory.py
@@ -45,6 +45,9 @@ def get_loader(type, format_type, dataset_type, epoch):
         elif type == DataLoaderType.DALI:
             from dlio_benchmark.data_loader.dali_data_loader import DaliDataLoader
             return DaliDataLoader(format_type, dataset_type, epoch)
+        elif type == DataLoaderType.NATIVE_DALI:
+            from dlio_benchmark.data_loader.native_dali_data_loader import NativeDaliDataLoader
+            return NativeDaliDataLoader(format_type, dataset_type, epoch)
         else:
             print("Data Loader %s not supported or plugins not found" % type)
             raise Exception(str(ErrorCodes.EC1004))
diff --git a/dlio_benchmark/data_loader/native_dali_data_loader.py b/dlio_benchmark/data_loader/native_dali_data_loader.py
new file mode 100644
index 00000000..2df04a5e
--- /dev/null
+++ b/dlio_benchmark/data_loader/native_dali_data_loader.py
@@ -0,0 +1,60 @@
+from time import time
+import logging
+import math
+import numpy as np
+from nvidia.dali.pipeline import Pipeline
+import nvidia.dali.fn as fn
+import nvidia.dali.types as types
+import nvidia.dali as dali
+from nvidia.dali.plugin.pytorch import DALIGenericIterator
+
+from dlio_benchmark.common.constants import MODULE_DATA_LOADER
+from dlio_benchmark.common.enumerations import Shuffle, DataLoaderType, DatasetType
+from dlio_benchmark.data_loader.base_data_loader import BaseDataLoader
+from dlio_benchmark.reader.reader_factory import ReaderFactory
+from dlio_benchmark.utils.utility import utcnow, get_rank, timeit, Profile
+
+dlp = Profile(MODULE_DATA_LOADER)
+
+
+class NativeDaliDataLoader(BaseDataLoader):
+    @dlp.log_init
+    def __init__(self, format_type, dataset_type, epoch):
+        super().__init__(format_type, dataset_type, epoch, DataLoaderType.NATIVE_DALI)
+        self.pipelines = []
+
+    @dlp.log
+    def read(self):
+        num_samples = self._args.total_samples_train if self.dataset_type is DatasetType.TRAIN else self._args.total_samples_eval
+        batch_size = self._args.batch_size if self.dataset_type is DatasetType.TRAIN else self._args.batch_size_eval
+        parallel = True if self._args.read_threads > 0 else False
+        self.pipelines = []
+        num_threads = 1
+        if self._args.read_threads > 0:
+            num_threads = self._args.read_threads
+        # None executes pipeline on CPU and the reader does the batching
+        pipeline = Pipeline(batch_size=batch_size, num_threads=num_threads, device_id=None, py_num_workers=num_threads,
+                               exec_async=False, exec_pipelined=False)
+        with pipeline:
+            images = ReaderFactory.get_reader(type=self.format_type,
+                                                      dataset_type=self.dataset_type,
+                                                      thread_index=-1,
+                                                      epoch_number=self.epoch_number).read()
+            pipeline.set_outputs(images)
+        self.pipelines.append(pipeline)
+        logging.info(f"{utcnow()} Creating {num_threads} pipelines by {self._args.my_rank} rank ")
+
+    @dlp.log
+    def next(self):
+        super().next()
+        num_samples = self._args.total_samples_train if self.dataset_type is DatasetType.TRAIN else self._args.total_samples_eval
+        batch_size = self._args.batch_size if self.dataset_type is DatasetType.TRAIN else self._args.batch_size_eval
+        for step in range(num_samples // batch_size):
+            _dataset = DALIGenericIterator(self.pipelines, ['data'])
+            for batch in _dataset:
+                logging.info(f"{utcnow()} Creating {len(batch)} batches by {self._args.my_rank} rank ")
+                yield batch
+
+    @dlp.log
+    def finalize(self):
+        pass
diff --git a/dlio_benchmark/reader/dali_image_reader.py b/dlio_benchmark/reader/dali_image_reader.py
new file mode 100644
index 00000000..adbc2c55
--- /dev/null
+++ b/dlio_benchmark/reader/dali_image_reader.py
@@ -0,0 +1,69 @@
+"""
+   Copyright (c) 2022, UChicago Argonne, LLC
+   All Rights Reserved
+
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+
+       http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License.
+"""
+import math
+import logging
+from time import time
+
+import nvidia.dali.fn as fn
+from dlio_benchmark.common.constants import MODULE_DATA_READER
+from dlio_benchmark.reader.dali_base_reader import DaliBaseReader
+from dlio_benchmark.reader.tf_base_reader import TFBaseReader
+from dlio_benchmark.utils.utility import utcnow, PerfTrace, Profile
+from dlio_benchmark.common.enumerations import DatasetType, Shuffle
+import nvidia.dali.tfrecord as tfrec
+
+dlp = Profile(MODULE_DATA_READER)
+
+
+class DaliImageReader(DaliBaseReader):
+    @dlp.log_init
+    def __init__(self, dataset_type):
+        super().__init__(dataset_type)
+
+    @dlp.log
+    def _load(self):
+        logging.debug(
+            f"{utcnow()} Reading {len(self.file_list)} files rank {self._args.my_rank}")
+        random_shuffle = False
+        seed = -1
+        seed_change_epoch = False
+        if self._args.sample_shuffle is not Shuffle.OFF:
+            if self._args.sample_shuffle is not Shuffle.SEED:
+                seed = self._args.seed
+            random_shuffle = True
+            seed_change_epoch = True
+        initial_fill = 1024
+        if self._args.shuffle_size > 0:
+            initial_fill = self._args.shuffle_size
+        prefetch_size = 1
+        if self._args.prefetch_size > 0:
+            prefetch_size = self._args.prefetch_size
+
+        stick_to_shard = True
+        if seed_change_epoch:
+            stick_to_shard = False
+        images, labels = fn.readers.file(files=files, num_shards=self._args.comm_size, 
+                                         prefetch_queue_depth=prefetch_size, 
+                                         initial_fill=initial_fill, random_shuffle=random_shuffle, 
+                                         shuffle_after_epoch=seed_change_epoch, 
+                                         stick_to_shard=stick_to_shard, pad_last_batch=True)
+        dataset = fn.decoders.image(jpegs, device='cpu')
+        return dataset
+
+    @dlp.log
+    def finalize(self):
+        pass
\ No newline at end of file
diff --git a/dlio_benchmark/reader/dali_npz_reader.py b/dlio_benchmark/reader/dali_npz_reader.py
new file mode 100644
index 00000000..f2887ef5
--- /dev/null
+++ b/dlio_benchmark/reader/dali_npz_reader.py
@@ -0,0 +1,68 @@
+"""
+   Copyright (c) 2022, UChicago Argonne, LLC
+   All Rights Reserved
+
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+
+       http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License.
+"""
+import math
+import logging
+from time import time
+
+import nvidia.dali.fn as fn
+from dlio_benchmark.common.constants import MODULE_DATA_READER
+from dlio_benchmark.reader.dali_base_reader import DaliBaseReader
+from dlio_benchmark.reader.tf_base_reader import TFBaseReader
+from dlio_benchmark.utils.utility import utcnow, PerfTrace, Profile
+from dlio_benchmark.common.enumerations import DatasetType, Shuffle
+import nvidia.dali.tfrecord as tfrec
+
+dlp = Profile(MODULE_DATA_READER)
+
+
+class DaliNPZReader(DaliBaseReader):
+    @dlp.log_init
+    def __init__(self, dataset_type):
+        super().__init__(dataset_type)
+
+    @dlp.log
+    def _load(self):
+        logging.debug(
+            f"{utcnow()} Reading {len(self.file_list)} files rank {self._args.my_rank}")
+        random_shuffle = False
+        seed = -1
+        seed_change_epoch = False
+        if self._args.sample_shuffle is not Shuffle.OFF:
+            if self._args.sample_shuffle is not Shuffle.SEED:
+                seed = self._args.seed
+            random_shuffle = True
+            seed_change_epoch = True
+        initial_fill = 1024
+        if self._args.shuffle_size > 0:
+            initial_fill = self._args.shuffle_size
+        prefetch_size = 1
+        if self._args.prefetch_size > 0:
+            prefetch_size = self._args.prefetch_size
+
+        stick_to_shard = True
+        if seed_change_epoch:
+            stick_to_shard = False
+
+        dataset = fn.readers.numpy(device='cpu', files=self.file_list, num_shards=self._args.comm_size,
+                                   prefetch_queue_depth=prefetch_size, initial_fill=initial_fill,
+                                   random_shuffle=random_shuffle, seed=seed, shuffle_after_epoch=seed_change_epoch,
+                                   stick_to_shard=stick_to_shard, pad_last_batch=True)
+        return dataset
+
+    @dlp.log
+    def finalize(self):
+        pass
diff --git a/dlio_benchmark/reader/dali_tfrecord_reader.py b/dlio_benchmark/reader/dali_tfrecord_reader.py
new file mode 100644
index 00000000..4b8147af
--- /dev/null
+++ b/dlio_benchmark/reader/dali_tfrecord_reader.py
@@ -0,0 +1,78 @@
+"""
+   Copyright (c) 2022, UChicago Argonne, LLC
+   All Rights Reserved
+
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+
+       http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License.
+"""
+import os.path
+
+import math
+import logging
+from time import time
+
+import nvidia
+import nvidia.dali.fn as fn
+from dlio_benchmark.common.constants import MODULE_DATA_READER
+from dlio_benchmark.reader.dali_base_reader import DaliBaseReader
+from dlio_benchmark.reader.tf_base_reader import TFBaseReader
+from dlio_benchmark.utils.utility import utcnow, PerfTrace, Profile
+from dlio_benchmark.common.enumerations import DatasetType, Shuffle
+import nvidia.dali.tfrecord as tfrec
+
+dlp = Profile(MODULE_DATA_READER)
+
+
+class DaliTFRecordReader(DaliBaseReader):
+    @dlp.log_init
+    def __init__(self, dataset_type):
+        super().__init__(dataset_type)
+
+    @dlp.log
+    def _load(self):
+        folder = "valid"
+        if self.dataset_type == DatasetType.TRAIN:
+            folder = "train"
+        index_folder = f"{self._args.data_folder}/index/{folder}"
+        index_files = []
+        for file in self.file_list:
+            filename = os.path.basename(file)
+            index_files.append(f"{index_folder}/{filename}.idx")
+        logging.info(
+            f"{utcnow()} Reading {len(self.file_list)} files rank {self._args.my_rank}")
+        random_shuffle = False
+        seed = -1
+        if self._args.sample_shuffle is not Shuffle.OFF:
+            if self._args.sample_shuffle is not Shuffle.SEED:
+                seed = self._args.seed
+            random_shuffle = True
+        initial_fill = 1024
+        if self._args.shuffle_size > 0:
+            initial_fill = self._args.shuffle_size
+        prefetch_size = 1
+        if self._args.prefetch_size > 0:
+            prefetch_size = self._args.prefetch_size
+        dataset = fn.readers.tfrecord(path=self.file_list,
+                                      index_path=index_files,
+                                      features={
+                                          'image': tfrec.FixedLenFeature((), tfrec.string, ""),
+                                          'size': tfrec.FixedLenFeature([1], tfrec.int64, 0)
+                                      }, num_shards=self._args.comm_size,
+                                      prefetch_queue_depth=prefetch_size,
+                                      initial_fill=initial_fill,
+                                      random_shuffle=random_shuffle, seed=seed,
+                                      stick_to_shard=True, pad_last_batch=True)
+        return dataset["image"]
+
+    @dlp.log
+    def finalize(self):
+        pass
diff --git a/dlio_benchmark/reader/png_reader.py b/dlio_benchmark/reader/image_reader.py
similarity index 96%
rename from dlio_benchmark/reader/png_reader.py
rename to dlio_benchmark/reader/image_reader.py
index 64183dd3..1fe63a05 100644
--- a/dlio_benchmark/reader/png_reader.py
+++ b/dlio_benchmark/reader/image_reader.py
@@ -26,9 +26,9 @@
 
 dlp = Profile(MODULE_DATA_READER)
 
-class PNGReader(FormatReader):
+class ImageReader(FormatReader):
     """
-    Reader for PNG files
+    Reader for PNG / JPEG files
     """
 
     @dlp.log_init
diff --git a/dlio_benchmark/reader/jpeg_reader.py b/dlio_benchmark/reader/jpeg_reader.py
deleted file mode 100644
index 664cde04..00000000
--- a/dlio_benchmark/reader/jpeg_reader.py
+++ /dev/null
@@ -1,62 +0,0 @@
-"""
-   Copyright (c) 2022, UChicago Argonne, LLC
-   All Rights Reserved
-
-   Licensed under the Apache License, Version 2.0 (the "License");
-   you may not use this file except in compliance with the License.
-   You may obtain a copy of the License at
-
-       http://www.apache.org/licenses/LICENSE-2.0
-
-   Unless required by applicable law or agreed to in writing, software
-   distributed under the License is distributed on an "AS IS" BASIS,
-   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-   See the License for the specific language governing permissions and
-   limitations under the License.
-"""
-
-import numpy as np
-from PIL import Image
-
-from dlio_benchmark.common.constants import MODULE_DATA_READER
-from dlio_benchmark.reader.reader_handler import FormatReader
-from dlio_profiler.logger import fn_interceptor as Profile
-
-dlp = Profile(MODULE_DATA_READER)
-
-
-class JPEGReader(FormatReader):
-    """
-    Reader for JPEG files
-    """
-
-    @dlp.log_init
-    def __init__(self, dataset_type, thread_index, epoch):
-        super().__init__(dataset_type, thread_index)
-
-    @dlp.log
-    def open(self, filename):
-        super().open(filename)
-        return np.asarray(Image.open(filename))
-
-    @dlp.log
-    def close(self, filename):
-        super().close(filename)
-
-    @dlp.log
-    def get_sample(self, filename, sample_index):
-        super().get_sample(filename, sample_index)
-        image = self.open_file_map[filename]
-        dlp.update(image_size=image.nbytes)
-
-    def next(self):
-        for batch in super().next():
-            yield batch
-
-    @dlp.log
-    def read_index(self, image_idx, step):
-        return super().read_index(image_idx, step)
-
-    @dlp.log
-    def finalize(self):
-        return super().finalize()
diff --git a/dlio_benchmark/reader/npz_reader.py b/dlio_benchmark/reader/npz_reader.py
deleted file mode 100644
index f0144f74..00000000
--- a/dlio_benchmark/reader/npz_reader.py
+++ /dev/null
@@ -1,60 +0,0 @@
-"""
-   Copyright (c) 2022, UChicago Argonne, LLC
-   All Rights Reserved
-
-   Licensed under the Apache License, Version 2.0 (the "License");
-   you may not use this file except in compliance with the License.
-   You may obtain a copy of the License at
-
-       http://www.apache.org/licenses/LICENSE-2.0
-
-   Unless required by applicable law or agreed to in writing, software
-   distributed under the License is distributed on an "AS IS" BASIS,
-   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-   See the License for the specific language governing permissions and
-   limitations under the License.
-"""
-import numpy as np
-
-from dlio_benchmark.common.constants import MODULE_DATA_READER
-from dlio_benchmark.reader.reader_handler import FormatReader
-from dlio_profiler.logger import fn_interceptor as Profile
-
-dlp = Profile(MODULE_DATA_READER)
-
-
-class NPZReader(FormatReader):
-    """
-    Reader for NPZ files
-    """
-
-    @dlp.log_init
-    def __init__(self, dataset_type, thread_index, epoch):
-        super().__init__(dataset_type, thread_index)
-
-    @dlp.log
-    def open(self, filename):
-        super().open(filename)
-        return np.load(filename, allow_pickle=True)['x']
-
-    @dlp.log
-    def close(self, filename):
-        super().close(filename)
-
-    @dlp.log
-    def get_sample(self, filename, sample_index):
-        super().get_sample(filename, sample_index)
-        image = self.open_file_map[filename][..., sample_index]
-        dlp.update(image_size=image.nbytes)
-
-    def next(self):
-        for batch in super().next():
-            yield batch
-
-    @dlp.log
-    def read_index(self, image_idx, step):
-        return super().read_index(image_idx, step)
-
-    @dlp.log
-    def finalize(self):
-        return super().finalize()
\ No newline at end of file
diff --git a/dlio_benchmark/reader/reader_factory.py b/dlio_benchmark/reader/reader_factory.py
index 74fc353e..e6055dc4 100644
--- a/dlio_benchmark/reader/reader_factory.py
+++ b/dlio_benchmark/reader/reader_factory.py
@@ -43,18 +43,24 @@ def get_reader(type, dataset_type, thread_index, epoch_number):
         elif type == FormatType.CSV:
             from dlio_benchmark.reader.csv_reader import CSVReader
             return CSVReader(dataset_type, thread_index, epoch_number)
-        elif type == FormatType.JPEG:
-            from dlio_benchmark.reader.jpeg_reader import JPEGReader
-            return JPEGReader(dataset_type, thread_index, epoch_number)
-        elif type == FormatType.PNG:
-            from dlio_benchmark.reader.png_reader import PNGReader
-            return PNGReader(dataset_type, thread_index, epoch_number)
+        elif type == FormatType.JPEG or FormatType.PNG:
+            if _args.data_loader == DataLoaderType.NATIVE_DALI
+                from dlio_benchmark.reader.image_reader import ImageReader
+                return DaliImageReader(dataset_type, thread_index, epoch_number)
         elif type == FormatType.NPZ:
-            from dlio_benchmark.reader.npz_reader import NPZReader
-            return NPZReader(dataset_type, thread_index, epoch_number)
+            if _args.data_loader == DataLoaderType.NATIVE_DALI
+                from dlio_benchmark.reader.dali_npz_reader import DaliNPZReader
+                return DaliNPZReader(dataset_type, thread_index, epoch_number)
+            else:
+                from dlio_benchmark.reader.npz_reader import NPZReader
+                return NPZReader(dataset_type, thread_index, epoch_number)
         elif type == FormatType.TFRECORD:
-            from dlio_benchmark.reader.tf_reader import TFReader
-            return TFReader(dataset_type, thread_index, epoch_number)
+            if _args.data_loader == DataLoaderType.NATIVE_DALI: 
+                from dlio_benchmark.reader.dali_tf_reader import DaliTFReader
+                return TFReader(dataset_type, thread_index, epoch_number)
+            else:
+                from dlio_benchmark.reader.tf_reader import TFReader
+                return TFReader(dataset_type, thread_index, epoch_number) 
         else:
             print("Loading data of %s format is not supported without framework data loader" %type)
             raise Exception(type)

From 63cb1c16be269a2d648d3662c4ad8e2ed79b0162 Mon Sep 17 00:00:00 2001
From: Huihuo Zheng <zhenghh04@gmail.com>
Date: Fri, 1 Dec 2023 20:25:43 +0000
Subject: [PATCH 03/36] added back npz_reader

---
 dlio_benchmark/reader/npz_reader.py | 60 +++++++++++++++++++++++++++++
 1 file changed, 60 insertions(+)
 create mode 100644 dlio_benchmark/reader/npz_reader.py

diff --git a/dlio_benchmark/reader/npz_reader.py b/dlio_benchmark/reader/npz_reader.py
new file mode 100644
index 00000000..f0144f74
--- /dev/null
+++ b/dlio_benchmark/reader/npz_reader.py
@@ -0,0 +1,60 @@
+"""
+   Copyright (c) 2022, UChicago Argonne, LLC
+   All Rights Reserved
+
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+
+       http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License.
+"""
+import numpy as np
+
+from dlio_benchmark.common.constants import MODULE_DATA_READER
+from dlio_benchmark.reader.reader_handler import FormatReader
+from dlio_profiler.logger import fn_interceptor as Profile
+
+dlp = Profile(MODULE_DATA_READER)
+
+
+class NPZReader(FormatReader):
+    """
+    Reader for NPZ files
+    """
+
+    @dlp.log_init
+    def __init__(self, dataset_type, thread_index, epoch):
+        super().__init__(dataset_type, thread_index)
+
+    @dlp.log
+    def open(self, filename):
+        super().open(filename)
+        return np.load(filename, allow_pickle=True)['x']
+
+    @dlp.log
+    def close(self, filename):
+        super().close(filename)
+
+    @dlp.log
+    def get_sample(self, filename, sample_index):
+        super().get_sample(filename, sample_index)
+        image = self.open_file_map[filename][..., sample_index]
+        dlp.update(image_size=image.nbytes)
+
+    def next(self):
+        for batch in super().next():
+            yield batch
+
+    @dlp.log
+    def read_index(self, image_idx, step):
+        return super().read_index(image_idx, step)
+
+    @dlp.log
+    def finalize(self):
+        return super().finalize()
\ No newline at end of file

From cdd73894ef95fa0760adda479fc98ddd5326e77e Mon Sep 17 00:00:00 2001
From: Huihuo Zheng <zhenghh04@gmail.com>
Date: Fri, 1 Dec 2023 20:30:16 +0000
Subject: [PATCH 04/36] fixed bugs

---
 dlio_benchmark/reader/reader_factory.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/dlio_benchmark/reader/reader_factory.py b/dlio_benchmark/reader/reader_factory.py
index e6055dc4..3ec683fb 100644
--- a/dlio_benchmark/reader/reader_factory.py
+++ b/dlio_benchmark/reader/reader_factory.py
@@ -44,11 +44,11 @@ def get_reader(type, dataset_type, thread_index, epoch_number):
             from dlio_benchmark.reader.csv_reader import CSVReader
             return CSVReader(dataset_type, thread_index, epoch_number)
         elif type == FormatType.JPEG or FormatType.PNG:
-            if _args.data_loader == DataLoaderType.NATIVE_DALI
+            if _args.data_loader == DataLoaderType.NATIVE_DALI:
                 from dlio_benchmark.reader.image_reader import ImageReader
                 return DaliImageReader(dataset_type, thread_index, epoch_number)
         elif type == FormatType.NPZ:
-            if _args.data_loader == DataLoaderType.NATIVE_DALI
+            if _args.data_loader == DataLoaderType.NATIVE_DALI:
                 from dlio_benchmark.reader.dali_npz_reader import DaliNPZReader
                 return DaliNPZReader(dataset_type, thread_index, epoch_number)
             else:

From 4f0008e940dcf6cd31bc88dfd96353c430d26378 Mon Sep 17 00:00:00 2001
From: Huihuo Zheng <zhenghh04@gmail.com>
Date: Fri, 1 Dec 2023 20:43:36 +0000
Subject: [PATCH 05/36] fixed bugs

---
 dlio_benchmark/reader/reader_factory.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/dlio_benchmark/reader/reader_factory.py b/dlio_benchmark/reader/reader_factory.py
index 3ec683fb..fdd12ee0 100644
--- a/dlio_benchmark/reader/reader_factory.py
+++ b/dlio_benchmark/reader/reader_factory.py
@@ -43,7 +43,7 @@ def get_reader(type, dataset_type, thread_index, epoch_number):
         elif type == FormatType.CSV:
             from dlio_benchmark.reader.csv_reader import CSVReader
             return CSVReader(dataset_type, thread_index, epoch_number)
-        elif type == FormatType.JPEG or FormatType.PNG:
+        elif type == FormatType.JPEG or type == FormatType.PNG:
             if _args.data_loader == DataLoaderType.NATIVE_DALI:
                 from dlio_benchmark.reader.image_reader import ImageReader
                 return DaliImageReader(dataset_type, thread_index, epoch_number)
@@ -57,7 +57,7 @@ def get_reader(type, dataset_type, thread_index, epoch_number):
         elif type == FormatType.TFRECORD:
             if _args.data_loader == DataLoaderType.NATIVE_DALI: 
                 from dlio_benchmark.reader.dali_tf_reader import DaliTFReader
-                return TFReader(dataset_type, thread_index, epoch_number)
+                return DaliTFReader(dataset_type, thread_index, epoch_number)
             else:
                 from dlio_benchmark.reader.tf_reader import TFReader
                 return TFReader(dataset_type, thread_index, epoch_number) 

From fbc9748b3f6416696db21e38b619b62c8818a4d3 Mon Sep 17 00:00:00 2001
From: Huihuo Zheng <zhenghh04@gmail.com>
Date: Fri, 1 Dec 2023 20:48:50 +0000
Subject: [PATCH 06/36] fixed image reader issue

---
 dlio_benchmark/reader/reader_factory.py | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/dlio_benchmark/reader/reader_factory.py b/dlio_benchmark/reader/reader_factory.py
index fdd12ee0..b7f39cb5 100644
--- a/dlio_benchmark/reader/reader_factory.py
+++ b/dlio_benchmark/reader/reader_factory.py
@@ -45,8 +45,11 @@ def get_reader(type, dataset_type, thread_index, epoch_number):
             return CSVReader(dataset_type, thread_index, epoch_number)
         elif type == FormatType.JPEG or type == FormatType.PNG:
             if _args.data_loader == DataLoaderType.NATIVE_DALI:
-                from dlio_benchmark.reader.image_reader import ImageReader
+                from dlio_benchmark.reader.dali_image_reader import DaliImageReader
                 return DaliImageReader(dataset_type, thread_index, epoch_number)
+            else:
+                from dlio_benchmark.reader.image_reader import ImageReader
+                return ImageReader(dataset_type, thread_index, epoch_number)            
         elif type == FormatType.NPZ:
             if _args.data_loader == DataLoaderType.NATIVE_DALI:
                 from dlio_benchmark.reader.dali_npz_reader import DaliNPZReader

From 9bde93cad3f7c39dff1f9290403b03fffffff884 Mon Sep 17 00:00:00 2001
From: Huihuo Zheng <huihuo.zheng@anl.gov>
Date: Wed, 6 Dec 2023 21:48:22 +0000
Subject: [PATCH 07/36] fixed Profile, PerfTrace

---
 dlio_benchmark/data_generator/tf_generator.py | 15 ++++-
 .../data_loader/native_dali_data_loader.py    |  3 +-
 dlio_benchmark/reader/dali_base_reader.py     | 65 +++++++++++++++++++
 dlio_benchmark/reader/dali_image_reader.py    |  4 +-
 dlio_benchmark/reader/dali_npz_reader.py      |  4 +-
 dlio_benchmark/reader/dali_tfrecord_reader.py |  4 +-
 dlio_benchmark/reader/reader_factory.py       |  8 +--
 dlio_benchmark/utils/config.py                |  6 +-
 8 files changed, 93 insertions(+), 16 deletions(-)
 create mode 100644 dlio_benchmark/reader/dali_base_reader.py

diff --git a/dlio_benchmark/data_generator/tf_generator.py b/dlio_benchmark/data_generator/tf_generator.py
index f10a9621..b1a94c9c 100644
--- a/dlio_benchmark/data_generator/tf_generator.py
+++ b/dlio_benchmark/data_generator/tf_generator.py
@@ -14,12 +14,15 @@
    See the License for the specific language governing permissions and
    limitations under the License.
 """
+import os
+from subprocess import call
 
 from dlio_benchmark.data_generator.data_generator import DataGenerator
 import numpy as np
 import tensorflow as tf
-from dlio_benchmark.utils.utility import progress, utcnow
 from dlio_profiler.logger import fn_interceptor as Profile
+
+from dlio_benchmark.utils.utility import progress, utcnow
 from shutil import copyfile
 from dlio_benchmark.common.constants import MODULE_DATA_GENERATOR
 
@@ -64,4 +67,14 @@ def generate(self):
                     serialized = example.SerializeToString()
                     # Write the serialized data to the TFRecords file.
                     writer.write(serialized)
+            tfrecord2idx_script = "tfrecord2idx"
+            folder = "train"
+            if "valid" in out_path_spec:
+                folder = "valid"
+            index_folder = f"{self._args.data_folder}/index/{folder}"
+            filename = os.path.basename(out_path_spec)
+            self.storage.create_node(index_folder, exist_ok=True)
+            tfrecord_idx = f"{index_folder}/{filename}.idx"
+            if not os.path.isfile(tfrecord_idx):
+                call([tfrecord2idx_script, out_path_spec, tfrecord_idx])
         np.random.seed()
diff --git a/dlio_benchmark/data_loader/native_dali_data_loader.py b/dlio_benchmark/data_loader/native_dali_data_loader.py
index 2df04a5e..86713315 100644
--- a/dlio_benchmark/data_loader/native_dali_data_loader.py
+++ b/dlio_benchmark/data_loader/native_dali_data_loader.py
@@ -12,7 +12,8 @@
 from dlio_benchmark.common.enumerations import Shuffle, DataLoaderType, DatasetType
 from dlio_benchmark.data_loader.base_data_loader import BaseDataLoader
 from dlio_benchmark.reader.reader_factory import ReaderFactory
-from dlio_benchmark.utils.utility import utcnow, get_rank, timeit, Profile
+from dlio_benchmark.utils.utility import utcnow, get_rank, timeit
+from dlio_profiler.logger import dlio_logger as PerfTrace, fn_interceptor as Profile
 
 dlp = Profile(MODULE_DATA_LOADER)
 
diff --git a/dlio_benchmark/reader/dali_base_reader.py b/dlio_benchmark/reader/dali_base_reader.py
new file mode 100644
index 00000000..b9dac9f1
--- /dev/null
+++ b/dlio_benchmark/reader/dali_base_reader.py
@@ -0,0 +1,65 @@
+"""
+   Copyright (c) 2022, UChicago Argonne, LLC
+   All Rights Reserved
+
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+
+       http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License.
+"""
+from time import sleep
+
+import numpy as np
+import nvidia
+from abc import ABC, abstractmethod
+from nvidia import dali
+
+from dlio_benchmark.common.constants import MODULE_DATA_READER
+from dlio_benchmark.common.enumerations import DatasetType
+from dlio_benchmark.utils.config import ConfigArguments
+from dlio_profiler.logger import dlio_logger as PerfTrace, fn_interceptor as Profile
+
+from nvidia.dali import fn
+dlp = Profile(MODULE_DATA_READER)
+
+class DaliBaseReader(ABC):
+
+    @dlp.log_init
+    def __init__(self, dataset_type):
+        self.dataset_type = dataset_type
+        self._args = ConfigArguments.get_instance()
+        self.batch_size = self._args.batch_size if self.dataset_type is DatasetType.TRAIN else self._args.batch_size_eval
+        self.file_list = self._args.file_list_train if self.dataset_type is DatasetType.TRAIN else self._args.file_list_eval
+
+    @dlp.log
+    def _preprocess(self, dataset):
+        if self._args.preprocess_time != 0. or self._args.preprocess_time_stdev != 0.:
+            t = np.random.normal(self._args.preprocess_time, self._args.preprocess_time_stdev)
+            sleep(max(t, 0.0))
+        return dataset
+
+    @dlp.log
+    def _resize(self, dataset):
+        return nvidia.dali.fn.reshape(dataset, shape=[self._args.max_dimension, self._args.max_dimension])
+
+    @abstractmethod
+    def _load(self):
+        pass
+
+    @dlp.log
+    def read(self):
+        dataset = self._load()
+        #dataset = self._resize(dataset)
+        #dataset = nvidia.dali.fn.python_function(dataset, function= self._preprocess, num_outputs=1)
+        return dataset
+
+    @abstractmethod
+    def finalize(self):
+        pass
diff --git a/dlio_benchmark/reader/dali_image_reader.py b/dlio_benchmark/reader/dali_image_reader.py
index adbc2c55..98f839b1 100644
--- a/dlio_benchmark/reader/dali_image_reader.py
+++ b/dlio_benchmark/reader/dali_image_reader.py
@@ -21,10 +21,10 @@
 import nvidia.dali.fn as fn
 from dlio_benchmark.common.constants import MODULE_DATA_READER
 from dlio_benchmark.reader.dali_base_reader import DaliBaseReader
-from dlio_benchmark.reader.tf_base_reader import TFBaseReader
-from dlio_benchmark.utils.utility import utcnow, PerfTrace, Profile
+from dlio_benchmark.utils.utility import utcnow
 from dlio_benchmark.common.enumerations import DatasetType, Shuffle
 import nvidia.dali.tfrecord as tfrec
+from dlio_profiler.logger import dlio_logger as PerfTrace, fn_interceptor as Profile
 
 dlp = Profile(MODULE_DATA_READER)
 
diff --git a/dlio_benchmark/reader/dali_npz_reader.py b/dlio_benchmark/reader/dali_npz_reader.py
index f2887ef5..9114716f 100644
--- a/dlio_benchmark/reader/dali_npz_reader.py
+++ b/dlio_benchmark/reader/dali_npz_reader.py
@@ -21,10 +21,10 @@
 import nvidia.dali.fn as fn
 from dlio_benchmark.common.constants import MODULE_DATA_READER
 from dlio_benchmark.reader.dali_base_reader import DaliBaseReader
-from dlio_benchmark.reader.tf_base_reader import TFBaseReader
-from dlio_benchmark.utils.utility import utcnow, PerfTrace, Profile
+from dlio_benchmark.utils.utility import utcnow
 from dlio_benchmark.common.enumerations import DatasetType, Shuffle
 import nvidia.dali.tfrecord as tfrec
+from dlio_profiler.logger import dlio_logger as PerfTrace, fn_interceptor as Profile
 
 dlp = Profile(MODULE_DATA_READER)
 
diff --git a/dlio_benchmark/reader/dali_tfrecord_reader.py b/dlio_benchmark/reader/dali_tfrecord_reader.py
index 4b8147af..d66285d5 100644
--- a/dlio_benchmark/reader/dali_tfrecord_reader.py
+++ b/dlio_benchmark/reader/dali_tfrecord_reader.py
@@ -24,10 +24,10 @@
 import nvidia.dali.fn as fn
 from dlio_benchmark.common.constants import MODULE_DATA_READER
 from dlio_benchmark.reader.dali_base_reader import DaliBaseReader
-from dlio_benchmark.reader.tf_base_reader import TFBaseReader
-from dlio_benchmark.utils.utility import utcnow, PerfTrace, Profile
+from dlio_benchmark.utils.utility import utcnow
 from dlio_benchmark.common.enumerations import DatasetType, Shuffle
 import nvidia.dali.tfrecord as tfrec
+from dlio_profiler.logger import dlio_logger as PerfTrace, fn_interceptor as Profile
 
 dlp = Profile(MODULE_DATA_READER)
 
diff --git a/dlio_benchmark/reader/reader_factory.py b/dlio_benchmark/reader/reader_factory.py
index b7f39cb5..10d7bc6c 100644
--- a/dlio_benchmark/reader/reader_factory.py
+++ b/dlio_benchmark/reader/reader_factory.py
@@ -46,21 +46,21 @@ def get_reader(type, dataset_type, thread_index, epoch_number):
         elif type == FormatType.JPEG or type == FormatType.PNG:
             if _args.data_loader == DataLoaderType.NATIVE_DALI:
                 from dlio_benchmark.reader.dali_image_reader import DaliImageReader
-                return DaliImageReader(dataset_type, thread_index, epoch_number)
+                return DaliImageReader(dataset_type)
             else:
                 from dlio_benchmark.reader.image_reader import ImageReader
                 return ImageReader(dataset_type, thread_index, epoch_number)            
         elif type == FormatType.NPZ:
             if _args.data_loader == DataLoaderType.NATIVE_DALI:
                 from dlio_benchmark.reader.dali_npz_reader import DaliNPZReader
-                return DaliNPZReader(dataset_type, thread_index, epoch_number)
+                return DaliNPZReader(dataset_type)
             else:
                 from dlio_benchmark.reader.npz_reader import NPZReader
                 return NPZReader(dataset_type, thread_index, epoch_number)
         elif type == FormatType.TFRECORD:
             if _args.data_loader == DataLoaderType.NATIVE_DALI: 
-                from dlio_benchmark.reader.dali_tf_reader import DaliTFReader
-                return DaliTFReader(dataset_type, thread_index, epoch_number)
+                from dlio_benchmark.reader.dali_tfrecord_reader import DaliTFRecordReader
+                return DaliTFRecordReader(dataset_type)
             else:
                 from dlio_benchmark.reader.tf_reader import TFReader
                 return TFReader(dataset_type, thread_index, epoch_number) 
diff --git a/dlio_benchmark/utils/config.py b/dlio_benchmark/utils/config.py
index c6d319d7..05bcd400 100644
--- a/dlio_benchmark/utils/config.py
+++ b/dlio_benchmark/utils/config.py
@@ -149,10 +149,8 @@ def validate(self):
         if (self.do_profiling == True) and (self.profiler == Profiler('darshan')):
             if ('LD_PRELOAD' not in os.environ or os.environ["LD_PRELOAD"].find("libdarshan") == -1):
                 raise Exception("Please set darshan runtime library in LD_PRELOAD")
-        if self.format is FormatType.TFRECORD and self.framework is not FrameworkType.TENSORFLOW:
-            raise Exception(f"{self.framework} support for tfrecord is not implemented.")
-        if self.format is FormatType.TFRECORD and self.data_loader is not DataLoaderType.TENSORFLOW:
-            raise Exception(f"{self.data_loader} support for tfrecord is not implemented.")
+        if self.format is FormatType.TFRECORD and (self.data_loader is DataLoaderType.PYTORCH):
+            raise Exception(f"{self.framework} support for tfrecord is not implemented for {self.data_loader}.")
         if (self.framework == FrameworkType.TENSORFLOW and self.data_loader == DataLoaderType.PYTORCH) or (
                 self.framework == FrameworkType.PYTORCH and self.data_loader == DataLoaderType.TENSORFLOW):
             raise Exception("Imcompatible between framework and data_loader setup.")

From c976bc54fe59feae2a88439c0d35d49848723c46 Mon Sep 17 00:00:00 2001
From: Huihuo Zheng <huihuo.zheng@anl.gov>
Date: Wed, 6 Dec 2023 21:52:01 +0000
Subject: [PATCH 08/36] removed unnecessary logs

---
 dlio_benchmark/data_loader/native_dali_data_loader.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/dlio_benchmark/data_loader/native_dali_data_loader.py b/dlio_benchmark/data_loader/native_dali_data_loader.py
index 86713315..838f2d1b 100644
--- a/dlio_benchmark/data_loader/native_dali_data_loader.py
+++ b/dlio_benchmark/data_loader/native_dali_data_loader.py
@@ -43,7 +43,6 @@ def read(self):
                                                       epoch_number=self.epoch_number).read()
             pipeline.set_outputs(images)
         self.pipelines.append(pipeline)
-        logging.info(f"{utcnow()} Creating {num_threads} pipelines by {self._args.my_rank} rank ")
 
     @dlp.log
     def next(self):

From 58febf3298ce3feeb1c4f948c16cd6551c7f74b3 Mon Sep 17 00:00:00 2001
From: Huihuo Zheng <huihuo.zheng@anl.gov>
Date: Wed, 6 Dec 2023 22:01:13 +0000
Subject: [PATCH 09/36] fixed dali_image_reader

---
 dlio_benchmark/reader/dali_image_reader.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/dlio_benchmark/reader/dali_image_reader.py b/dlio_benchmark/reader/dali_image_reader.py
index 98f839b1..08518545 100644
--- a/dlio_benchmark/reader/dali_image_reader.py
+++ b/dlio_benchmark/reader/dali_image_reader.py
@@ -56,7 +56,7 @@ def _load(self):
         stick_to_shard = True
         if seed_change_epoch:
             stick_to_shard = False
-        images, labels = fn.readers.file(files=files, num_shards=self._args.comm_size, 
+        images, labels = fn.readers.file(files=self.file_list, num_shards=self._args.comm_size, 
                                          prefetch_queue_depth=prefetch_size, 
                                          initial_fill=initial_fill, random_shuffle=random_shuffle, 
                                          shuffle_after_epoch=seed_change_epoch, 

From 3cee0fda1189558b7f153feb655011bbdde63cb0 Mon Sep 17 00:00:00 2001
From: Huihuo Zheng <huihuo.zheng@anl.gov>
Date: Wed, 6 Dec 2023 22:02:00 +0000
Subject: [PATCH 10/36] fixed dali_image_reader

---
 dlio_benchmark/reader/dali_image_reader.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/dlio_benchmark/reader/dali_image_reader.py b/dlio_benchmark/reader/dali_image_reader.py
index 08518545..b814de86 100644
--- a/dlio_benchmark/reader/dali_image_reader.py
+++ b/dlio_benchmark/reader/dali_image_reader.py
@@ -61,7 +61,7 @@ def _load(self):
                                          initial_fill=initial_fill, random_shuffle=random_shuffle, 
                                          shuffle_after_epoch=seed_change_epoch, 
                                          stick_to_shard=stick_to_shard, pad_last_batch=True)
-        dataset = fn.decoders.image(jpegs, device='cpu')
+        dataset = fn.decoders.image(images, device='cpu')
         return dataset
 
     @dlp.log

From a666d5c3d5f3a7ebca1c1013d3a3756a6907d688 Mon Sep 17 00:00:00 2001
From: Huihuo Zheng <huihuo.zheng@anl.gov>
Date: Wed, 6 Dec 2023 22:15:40 +0000
Subject: [PATCH 11/36] added support for npy format

---
 dlio_benchmark/common/enumerations.py         |  3 +
 .../data_generator/generator_factory.py       |  3 +
 ...{dali_npz_reader.py => dali_npy_reader.py} |  2 +-
 dlio_benchmark/reader/npy_reader.py           | 60 +++++++++++++++++++
 dlio_benchmark/reader/reader_factory.py       | 13 +++-
 5 files changed, 77 insertions(+), 4 deletions(-)
 rename dlio_benchmark/reader/{dali_npz_reader.py => dali_npy_reader.py} (98%)
 create mode 100644 dlio_benchmark/reader/npy_reader.py

diff --git a/dlio_benchmark/common/enumerations.py b/dlio_benchmark/common/enumerations.py
index 0081195e..c8c448b5 100644
--- a/dlio_benchmark/common/enumerations.py
+++ b/dlio_benchmark/common/enumerations.py
@@ -93,6 +93,7 @@ class FormatType(Enum):
     HDF5 = 'hdf5'
     CSV = 'csv'
     NPZ = 'npz'
+    NPY = 'npy'
     HDF5_OPT = 'hdf5_opt'
     JPEG = 'jpeg'
     PNG = 'png'
@@ -110,6 +111,8 @@ def get_enum(value):
             return FormatType.CSV
         elif FormatType.NPZ.value == value:
             return FormatType.NPZ
+        elif FormatType.NPY.value == value:
+            return FormatType.NPY            
         elif FormatType.HDF5_OPT.value == value:
             return FormatType.HDF5_OPT
         elif FormatType.JPEG.value == value:
diff --git a/dlio_benchmark/data_generator/generator_factory.py b/dlio_benchmark/data_generator/generator_factory.py
index 7c05d3a4..e61ead4c 100644
--- a/dlio_benchmark/data_generator/generator_factory.py
+++ b/dlio_benchmark/data_generator/generator_factory.py
@@ -38,6 +38,9 @@ def get_generator(type):
         elif type == FormatType.NPZ:
             from dlio_benchmark.data_generator.npz_generator import NPZGenerator
             return NPZGenerator()
+        elif type == FormatType.NPY:
+            from dlio_benchmark.data_generator.npy_generator import NPYGenerator
+            return NPYGenerator()            
         elif type == FormatType.JPEG:
             from dlio_benchmark.data_generator.jpeg_generator import JPEGGenerator
             return JPEGGenerator()
diff --git a/dlio_benchmark/reader/dali_npz_reader.py b/dlio_benchmark/reader/dali_npy_reader.py
similarity index 98%
rename from dlio_benchmark/reader/dali_npz_reader.py
rename to dlio_benchmark/reader/dali_npy_reader.py
index 9114716f..ee0592ef 100644
--- a/dlio_benchmark/reader/dali_npz_reader.py
+++ b/dlio_benchmark/reader/dali_npy_reader.py
@@ -29,7 +29,7 @@
 dlp = Profile(MODULE_DATA_READER)
 
 
-class DaliNPZReader(DaliBaseReader):
+class DaliNPYReader(DaliBaseReader):
     @dlp.log_init
     def __init__(self, dataset_type):
         super().__init__(dataset_type)
diff --git a/dlio_benchmark/reader/npy_reader.py b/dlio_benchmark/reader/npy_reader.py
new file mode 100644
index 00000000..435df6b3
--- /dev/null
+++ b/dlio_benchmark/reader/npy_reader.py
@@ -0,0 +1,60 @@
+"""
+   Copyright (c) 2022, UChicago Argonne, LLC
+   All Rights Reserved
+
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+
+       http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License.
+"""
+import numpy as np
+
+from dlio_benchmark.common.constants import MODULE_DATA_READER
+from dlio_benchmark.reader.reader_handler import FormatReader
+from dlio_profiler.logger import fn_interceptor as Profile
+
+dlp = Profile(MODULE_DATA_READER)
+
+
+class NPYReader(FormatReader):
+    """
+    Reader for NPZ files
+    """
+
+    @dlp.log_init
+    def __init__(self, dataset_type, thread_index, epoch):
+        super().__init__(dataset_type, thread_index)
+
+    @dlp.log
+    def open(self, filename):
+        super().open(filename)
+        return np.load(filename)
+
+    @dlp.log
+    def close(self, filename):
+        super().close(filename)
+
+    @dlp.log
+    def get_sample(self, filename, sample_index):
+        super().get_sample(filename, sample_index)
+        image = self.open_file_map[filename][..., sample_index]
+        dlp.update(image_size=image.nbytes)
+
+    def next(self):
+        for batch in super().next():
+            yield batch
+
+    @dlp.log
+    def read_index(self, image_idx, step):
+        return super().read_index(image_idx, step)
+
+    @dlp.log
+    def finalize(self):
+        return super().finalize()
\ No newline at end of file
diff --git a/dlio_benchmark/reader/reader_factory.py b/dlio_benchmark/reader/reader_factory.py
index 10d7bc6c..68f9bbb8 100644
--- a/dlio_benchmark/reader/reader_factory.py
+++ b/dlio_benchmark/reader/reader_factory.py
@@ -49,11 +49,18 @@ def get_reader(type, dataset_type, thread_index, epoch_number):
                 return DaliImageReader(dataset_type)
             else:
                 from dlio_benchmark.reader.image_reader import ImageReader
-                return ImageReader(dataset_type, thread_index, epoch_number)            
+                return ImageReader(dataset_type, thread_index, epoch_number)   
+        elif type == FormatType.NPY:
+            if _args.data_loader == DataLoaderType.NATIVE_DALI:
+                from dlio_benchmark.reader.dali_npy_reader import DaliNPYReader
+                return DaliNPYReader(dataset_type)
+            else:
+                from dlio_benchmark.reader.npy_reader import NPYReader
+                return NPYReader(dataset_type, thread_index, epoch_number)                         
         elif type == FormatType.NPZ:
             if _args.data_loader == DataLoaderType.NATIVE_DALI:
-                from dlio_benchmark.reader.dali_npz_reader import DaliNPZReader
-                return DaliNPZReader(dataset_type)
+                print("Loading data of %s format is not supported without framework data loader" %type)
+                raise Exception(type)
             else:
                 from dlio_benchmark.reader.npz_reader import NPZReader
                 return NPZReader(dataset_type, thread_index, epoch_number)

From f0722b6fbac42d2ec51107df20c330c66e8e62df Mon Sep 17 00:00:00 2001
From: Huihuo Zheng <huihuo.zheng@anl.gov>
Date: Wed, 6 Dec 2023 22:17:51 +0000
Subject: [PATCH 12/36] added support for npy format

---
 dlio_benchmark/data_loader/native_dali_data_loader.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/dlio_benchmark/data_loader/native_dali_data_loader.py b/dlio_benchmark/data_loader/native_dali_data_loader.py
index 838f2d1b..8799800b 100644
--- a/dlio_benchmark/data_loader/native_dali_data_loader.py
+++ b/dlio_benchmark/data_loader/native_dali_data_loader.py
@@ -52,7 +52,7 @@ def next(self):
         for step in range(num_samples // batch_size):
             _dataset = DALIGenericIterator(self.pipelines, ['data'])
             for batch in _dataset:
-                logging.info(f"{utcnow()} Creating {len(batch)} batches by {self._args.my_rank} rank ")
+                logging.debug(f"{utcnow()} Creating {len(batch)} batches by {self._args.my_rank} rank ")
                 yield batch
 
     @dlp.log

From 7155d5d87e79e54102aaa0046b1833558a248a96 Mon Sep 17 00:00:00 2001
From: Huihuo Zheng <zhenghh04@gmail.com>
Date: Fri, 8 Dec 2023 19:35:33 +0000
Subject: [PATCH 13/36] changed enumerations

---
 dlio_benchmark/common/enumerations.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/dlio_benchmark/common/enumerations.py b/dlio_benchmark/common/enumerations.py
index 0081195e..ef5b48a8 100644
--- a/dlio_benchmark/common/enumerations.py
+++ b/dlio_benchmark/common/enumerations.py
@@ -124,7 +124,7 @@ class DataLoaderType(Enum):
     TENSORFLOW='tensorflow'
     PYTORCH='pytorch'
     DALI='dali'
-    NATIVE_DALI = 'native_dali'
+    NATIVE_DALI='native_dali'
     CUSTOM='custom'
     NONE='none'
     

From 9935840128695b848e550b4aff0f4409035d1594 Mon Sep 17 00:00:00 2001
From: Huihuo Zheng <zhenghh04@gmail.com>
Date: Mon, 11 Dec 2023 16:54:39 +0000
Subject: [PATCH 14/36] added removed dali base reader

---
 dlio_benchmark/reader/dali_base_reader.py     | 65 -------------------
 dlio_benchmark/reader/dali_image_reader.py    | 23 +++++--
 dlio_benchmark/reader/dali_npy_reader.py      | 24 +++++--
 dlio_benchmark/reader/dali_tfrecord_reader.py | 25 +++++--
 dlio_benchmark/reader/reader_factory.py       |  6 +-
 dlio_benchmark/reader/reader_handler.py       |  4 ++
 6 files changed, 63 insertions(+), 84 deletions(-)
 delete mode 100644 dlio_benchmark/reader/dali_base_reader.py

diff --git a/dlio_benchmark/reader/dali_base_reader.py b/dlio_benchmark/reader/dali_base_reader.py
deleted file mode 100644
index b9dac9f1..00000000
--- a/dlio_benchmark/reader/dali_base_reader.py
+++ /dev/null
@@ -1,65 +0,0 @@
-"""
-   Copyright (c) 2022, UChicago Argonne, LLC
-   All Rights Reserved
-
-   Licensed under the Apache License, Version 2.0 (the "License");
-   you may not use this file except in compliance with the License.
-   You may obtain a copy of the License at
-
-       http://www.apache.org/licenses/LICENSE-2.0
-
-   Unless required by applicable law or agreed to in writing, software
-   distributed under the License is distributed on an "AS IS" BASIS,
-   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-   See the License for the specific language governing permissions and
-   limitations under the License.
-"""
-from time import sleep
-
-import numpy as np
-import nvidia
-from abc import ABC, abstractmethod
-from nvidia import dali
-
-from dlio_benchmark.common.constants import MODULE_DATA_READER
-from dlio_benchmark.common.enumerations import DatasetType
-from dlio_benchmark.utils.config import ConfigArguments
-from dlio_profiler.logger import dlio_logger as PerfTrace, fn_interceptor as Profile
-
-from nvidia.dali import fn
-dlp = Profile(MODULE_DATA_READER)
-
-class DaliBaseReader(ABC):
-
-    @dlp.log_init
-    def __init__(self, dataset_type):
-        self.dataset_type = dataset_type
-        self._args = ConfigArguments.get_instance()
-        self.batch_size = self._args.batch_size if self.dataset_type is DatasetType.TRAIN else self._args.batch_size_eval
-        self.file_list = self._args.file_list_train if self.dataset_type is DatasetType.TRAIN else self._args.file_list_eval
-
-    @dlp.log
-    def _preprocess(self, dataset):
-        if self._args.preprocess_time != 0. or self._args.preprocess_time_stdev != 0.:
-            t = np.random.normal(self._args.preprocess_time, self._args.preprocess_time_stdev)
-            sleep(max(t, 0.0))
-        return dataset
-
-    @dlp.log
-    def _resize(self, dataset):
-        return nvidia.dali.fn.reshape(dataset, shape=[self._args.max_dimension, self._args.max_dimension])
-
-    @abstractmethod
-    def _load(self):
-        pass
-
-    @dlp.log
-    def read(self):
-        dataset = self._load()
-        #dataset = self._resize(dataset)
-        #dataset = nvidia.dali.fn.python_function(dataset, function= self._preprocess, num_outputs=1)
-        return dataset
-
-    @abstractmethod
-    def finalize(self):
-        pass
diff --git a/dlio_benchmark/reader/dali_image_reader.py b/dlio_benchmark/reader/dali_image_reader.py
index b814de86..258e9c7d 100644
--- a/dlio_benchmark/reader/dali_image_reader.py
+++ b/dlio_benchmark/reader/dali_image_reader.py
@@ -20,7 +20,7 @@
 
 import nvidia.dali.fn as fn
 from dlio_benchmark.common.constants import MODULE_DATA_READER
-from dlio_benchmark.reader.dali_base_reader import DaliBaseReader
+from dlio_benchmark.dlio_benchmark.reader.reader_handler import FormatReader
 from dlio_benchmark.utils.utility import utcnow
 from dlio_benchmark.common.enumerations import DatasetType, Shuffle
 import nvidia.dali.tfrecord as tfrec
@@ -29,13 +29,13 @@
 dlp = Profile(MODULE_DATA_READER)
 
 
-class DaliImageReader(DaliBaseReader):
+class DaliImageReader(FormatReader):
     @dlp.log_init
-    def __init__(self, dataset_type):
-        super().__init__(dataset_type)
+    def __init__(dataset_type, thread_index, epoch):
+        super().__init__(dataset_type, thread_index)
 
     @dlp.log
-    def _load(self):
+    def read(self):
         logging.debug(
             f"{utcnow()} Reading {len(self.file_list)} files rank {self._args.my_rank}")
         random_shuffle = False
@@ -62,8 +62,21 @@ def _load(self):
                                          shuffle_after_epoch=seed_change_epoch, 
                                          stick_to_shard=stick_to_shard, pad_last_batch=True)
         dataset = fn.decoders.image(images, device='cpu')
+        dataset = self._preprocess(images)
+        dataset = self._resize(images)
         return dataset
 
+    @dlp.log
+    def _preprocess(self, dataset):
+        if self._args.preprocess_time != 0. or self._args.preprocess_time_stdev != 0.:
+            t = np.random.normal(self._args.preprocess_time, self._args.preprocess_time_stdev)
+            sleep(max(t, 0.0))
+        return dataset
+
+    @dlp.log
+    def _resize(self, dataset):
+        return nvidia.dali.fn.reshape(dataset, shape=[self._args.max_dimension, self._args.max_dimension])
+
     @dlp.log
     def finalize(self):
         pass
\ No newline at end of file
diff --git a/dlio_benchmark/reader/dali_npy_reader.py b/dlio_benchmark/reader/dali_npy_reader.py
index ee0592ef..f2b12730 100644
--- a/dlio_benchmark/reader/dali_npy_reader.py
+++ b/dlio_benchmark/reader/dali_npy_reader.py
@@ -20,7 +20,7 @@
 
 import nvidia.dali.fn as fn
 from dlio_benchmark.common.constants import MODULE_DATA_READER
-from dlio_benchmark.reader.dali_base_reader import DaliBaseReader
+from dlio_benchmark.dlio_benchmark.reader.reader_handler import FormatReader
 from dlio_benchmark.utils.utility import utcnow
 from dlio_benchmark.common.enumerations import DatasetType, Shuffle
 import nvidia.dali.tfrecord as tfrec
@@ -29,13 +29,13 @@
 dlp = Profile(MODULE_DATA_READER)
 
 
-class DaliNPYReader(DaliBaseReader):
+class DaliNPYReader(NativeDaliBaseReader):
     @dlp.log_init
-    def __init__(self, dataset_type):
-        super().__init__(dataset_type)
+    def __init__(dataset_type, thread_index, epoch):
+        super().__init__(dataset_type, thread_index)
 
     @dlp.log
-    def _load(self):
+    def read(self):
         logging.debug(
             f"{utcnow()} Reading {len(self.file_list)} files rank {self._args.my_rank}")
         random_shuffle = False
@@ -61,8 +61,22 @@ def _load(self):
                                    prefetch_queue_depth=prefetch_size, initial_fill=initial_fill,
                                    random_shuffle=random_shuffle, seed=seed, shuffle_after_epoch=seed_change_epoch,
                                    stick_to_shard=stick_to_shard, pad_last_batch=True)
+
+        dataset = self._preprocess(dataset)
+        dataset = self._resize(dataset)
         return dataset
+    
 
+    @dlp.log
+    def _preprocess(self, dataset):
+        if self._args.preprocess_time != 0. or self._args.preprocess_time_stdev != 0.:
+            t = np.random.normal(self._args.preprocess_time, self._args.preprocess_time_stdev)
+            sleep(max(t, 0.0))
+        return dataset
+
+    @dlp.log
+    def _resize(self, dataset):
+        return nvidia.dali.fn.reshape(dataset, shape=[self._args.max_dimension, self._args.max_dimension])
     @dlp.log
     def finalize(self):
         pass
diff --git a/dlio_benchmark/reader/dali_tfrecord_reader.py b/dlio_benchmark/reader/dali_tfrecord_reader.py
index d66285d5..129e6d44 100644
--- a/dlio_benchmark/reader/dali_tfrecord_reader.py
+++ b/dlio_benchmark/reader/dali_tfrecord_reader.py
@@ -23,7 +23,7 @@
 import nvidia
 import nvidia.dali.fn as fn
 from dlio_benchmark.common.constants import MODULE_DATA_READER
-from dlio_benchmark.reader.dali_base_reader import DaliBaseReader
+from dlio_benchmark.dlio_benchmark.reader.reader_handler import FormatReader
 from dlio_benchmark.utils.utility import utcnow
 from dlio_benchmark.common.enumerations import DatasetType, Shuffle
 import nvidia.dali.tfrecord as tfrec
@@ -32,13 +32,13 @@
 dlp = Profile(MODULE_DATA_READER)
 
 
-class DaliTFRecordReader(DaliBaseReader):
+class DaliTFRecordReader(FormatReader):
     @dlp.log_init
-    def __init__(self, dataset_type):
-        super().__init__(dataset_type)
+    def __init__(dataset_type, thread_index, epoch):
+        super().__init__(dataset_type, thread_index)
 
     @dlp.log
-    def _load(self):
+    def read(self):
         folder = "valid"
         if self.dataset_type == DatasetType.TRAIN:
             folder = "train"
@@ -71,7 +71,20 @@ def _load(self):
                                       initial_fill=initial_fill,
                                       random_shuffle=random_shuffle, seed=seed,
                                       stick_to_shard=True, pad_last_batch=True)
-        return dataset["image"]
+        dataset = self._preprocess(dataset["image"])
+        dataset = self._resize(dataset)
+        return dataset
+
+    @dlp.log
+    def _preprocess(self, dataset):
+        if self._args.preprocess_time != 0. or self._args.preprocess_time_stdev != 0.:
+            t = np.random.normal(self._args.preprocess_time, self._args.preprocess_time_stdev)
+            sleep(max(t, 0.0))
+        return dataset
+
+    @dlp.log
+    def _resize(self, dataset):
+        return nvidia.dali.fn.reshape(dataset, shape=[self._args.max_dimension, self._args.max_dimension])
 
     @dlp.log
     def finalize(self):
diff --git a/dlio_benchmark/reader/reader_factory.py b/dlio_benchmark/reader/reader_factory.py
index 68f9bbb8..c1404722 100644
--- a/dlio_benchmark/reader/reader_factory.py
+++ b/dlio_benchmark/reader/reader_factory.py
@@ -46,14 +46,14 @@ def get_reader(type, dataset_type, thread_index, epoch_number):
         elif type == FormatType.JPEG or type == FormatType.PNG:
             if _args.data_loader == DataLoaderType.NATIVE_DALI:
                 from dlio_benchmark.reader.dali_image_reader import DaliImageReader
-                return DaliImageReader(dataset_type)
+                return DaliImageReader(dataset_type, thread_index, epoch_number)
             else:
                 from dlio_benchmark.reader.image_reader import ImageReader
                 return ImageReader(dataset_type, thread_index, epoch_number)   
         elif type == FormatType.NPY:
             if _args.data_loader == DataLoaderType.NATIVE_DALI:
                 from dlio_benchmark.reader.dali_npy_reader import DaliNPYReader
-                return DaliNPYReader(dataset_type)
+                return DaliNPYReader(dataset_type, thread_index, epoch_number)
             else:
                 from dlio_benchmark.reader.npy_reader import NPYReader
                 return NPYReader(dataset_type, thread_index, epoch_number)                         
@@ -67,7 +67,7 @@ def get_reader(type, dataset_type, thread_index, epoch_number):
         elif type == FormatType.TFRECORD:
             if _args.data_loader == DataLoaderType.NATIVE_DALI: 
                 from dlio_benchmark.reader.dali_tfrecord_reader import DaliTFRecordReader
-                return DaliTFRecordReader(dataset_type)
+                return DaliTFRecordReader(dataset_type, thread_index, epoch_number)
             else:
                 from dlio_benchmark.reader.tf_reader import TFReader
                 return TFReader(dataset_type, thread_index, epoch_number) 
diff --git a/dlio_benchmark/reader/reader_handler.py b/dlio_benchmark/reader/reader_handler.py
index 48675ab3..1b6f9189 100644
--- a/dlio_benchmark/reader/reader_handler.py
+++ b/dlio_benchmark/reader/reader_handler.py
@@ -124,6 +124,10 @@ def finalize(self):
                 self.close(filename)
                 self.open_file_map[filename] = None
 
+    @abstractmethod
+    def read(self):
+        pass
+
     def __del__(self):
         self.thread_index = None
         self._args = None

From 5dc3907e627da6ad93b196bd73a5ac701778c26d Mon Sep 17 00:00:00 2001
From: Huihuo Zheng <zhenghh04@gmail.com>
Date: Mon, 11 Dec 2023 16:56:20 +0000
Subject: [PATCH 15/36] fixed a bug

---
 dlio_benchmark/reader/dali_npy_reader.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/dlio_benchmark/reader/dali_npy_reader.py b/dlio_benchmark/reader/dali_npy_reader.py
index f2b12730..a037a0db 100644
--- a/dlio_benchmark/reader/dali_npy_reader.py
+++ b/dlio_benchmark/reader/dali_npy_reader.py
@@ -29,7 +29,7 @@
 dlp = Profile(MODULE_DATA_READER)
 
 
-class DaliNPYReader(NativeDaliBaseReader):
+class DaliNPYReader(FormatReader):
     @dlp.log_init
     def __init__(dataset_type, thread_index, epoch):
         super().__init__(dataset_type, thread_index)

From 3fb3602d589d0b22f33013d7e22535a77a433070 Mon Sep 17 00:00:00 2001
From: Huihuo Zheng <zhenghh04@gmail.com>
Date: Mon, 11 Dec 2023 17:04:20 +0000
Subject: [PATCH 16/36] added native-dali-loader tests in github action

---
 .github/workflows/python-package-conda.yml | 10 ++++++++++
 1 file changed, 10 insertions(+)

diff --git a/.github/workflows/python-package-conda.yml b/.github/workflows/python-package-conda.yml
index 1108991f..ff8d3ee0 100644
--- a/.github/workflows/python-package-conda.yml
+++ b/.github/workflows/python-package-conda.yml
@@ -149,6 +149,16 @@ jobs:
         source ${VENV}/bin/activate
         mpirun -np 2 dlio_benchmark workload=unet3d ++workload.framework=tensorflow ++workload.data_reader.data_loader=tensorflow ++workload.train.computation_time=0.05 ++workload.evaluation.eval_time=0.01 ++workload.train.epochs=2 ++workload.workflow.train=False ++workload.workflow.generate_data=True ++workload.dataset.num_files_train=16 ++workload.dataset.num_files_eval=16 ++workload.reader.read_threads=2  ++workload.dataset.record_length=4096 ++workload.dataset.record_length_stdev=0
         mpirun -np 2 dlio_benchmark workload=unet3d ++workload.framework=tensorflow ++workload.data_reader.data_loader=tensorflow ++workload.train.computation_time=0.05 ++workload.evaluation.eval_time=0.01 ++workload.train.epochs=2 ++workload.workflow.train=True ++workload.workflow.generate_data=False ++workload.dataset.num_files_train=16 ++workload.dataset.num_files_eval=16 ++workload.reader.read_threads=2  ++workload.dataset.record_length=4096 ++workload.dataset.record_length_stdev=0
+    - name: test-torch-native-dali-loader-npz
+        run: |
+          source ${VENV}/bin/activate
+          mpirun -np 2 dlio_benchmark workload=unet3d ++workload.reader.data_loader=native_dali ++workload.train.computation_time=0.05 ++workload.evaluation.eval_time=0.01 ++workload.train.epochs=1 ++workload.workflow.train=False ++workload.workflow.generate_data=True ++workload.dataset.num_files_train=16 ++workload.dataset.num_files_eval=16 ++workload.reader.read_threads=2 ++workload.dataset.record_length=4096 ++workload.dataset.record_length_stdev=0
+          mpirun -np 2 dlio_benchmark workload=unet3d ++workload.reader.data_loader=native_dali ++workload.train.computation_time=0.05 ++workload.evaluation.eval_time=0.01 ++workload.train.epochs=1 ++workload.workflow.train=True ++workload.workflow.generate_data=False ++workload.dataset.num_files_train=16 ++workload.dataset.num_files_eval=16 ++workload.reader.read_threads=2  ++workload.dataset.record_length=4096 ++workload.dataset.record_length_stdev=0
+    - name: test-tf-native-dali-loader-npz
+        run: |
+          source ${VENV}/bin/activate
+          mpirun -np 2 dlio_benchmark workload=unet3d ++workload.framework=tensorflow ++workload.reader.data_loader=native_dali ++workload.train.computation_time=0.05 ++workload.evaluation.eval_time=0.01 ++workload.train.epochs=1 ++workload.workflow.train=False ++workload.workflow.generate_data=True ++workload.dataset.num_files_train=16 ++workload.dataset.num_files_eval=16 ++workload.reader.read_threads=2 ++workload.dataset.record_length=4096 ++workload.dataset.record_length_stdev=0
+          mpirun -np 2 dlio_benchmark workload=unet3d ++workload.framework=tensorflow ++workload.reader.data_loader=native_dali ++workload.train.computation_time=0.05 ++workload.evaluation.eval_time=0.01 ++workload.train.epochs=1 ++workload.workflow.train=True ++workload.workflow.generate_data=False ++workload.dataset.num_files_train=16 ++workload.dataset.num_files_eval=16 ++workload.reader.read_threads=2  ++workload.dataset.record_length=4096 ++workload.dataset.record_length_stdev=0
     - name: test_subset
       run: |
         source ${VENV}/bin/activate

From 248cfa2641cc39a1b9a11e2a2ca3de54fe96b2c7 Mon Sep 17 00:00:00 2001
From: Huihuo Zheng <zhenghh04@gmail.com>
Date: Mon, 11 Dec 2023 17:05:53 +0000
Subject: [PATCH 17/36] corrected github action formats

---
 .github/workflows/python-package-conda.yml | 16 ++++++++--------
 1 file changed, 8 insertions(+), 8 deletions(-)

diff --git a/.github/workflows/python-package-conda.yml b/.github/workflows/python-package-conda.yml
index ff8d3ee0..f7f579d4 100644
--- a/.github/workflows/python-package-conda.yml
+++ b/.github/workflows/python-package-conda.yml
@@ -150,15 +150,15 @@ jobs:
         mpirun -np 2 dlio_benchmark workload=unet3d ++workload.framework=tensorflow ++workload.data_reader.data_loader=tensorflow ++workload.train.computation_time=0.05 ++workload.evaluation.eval_time=0.01 ++workload.train.epochs=2 ++workload.workflow.train=False ++workload.workflow.generate_data=True ++workload.dataset.num_files_train=16 ++workload.dataset.num_files_eval=16 ++workload.reader.read_threads=2  ++workload.dataset.record_length=4096 ++workload.dataset.record_length_stdev=0
         mpirun -np 2 dlio_benchmark workload=unet3d ++workload.framework=tensorflow ++workload.data_reader.data_loader=tensorflow ++workload.train.computation_time=0.05 ++workload.evaluation.eval_time=0.01 ++workload.train.epochs=2 ++workload.workflow.train=True ++workload.workflow.generate_data=False ++workload.dataset.num_files_train=16 ++workload.dataset.num_files_eval=16 ++workload.reader.read_threads=2  ++workload.dataset.record_length=4096 ++workload.dataset.record_length_stdev=0
     - name: test-torch-native-dali-loader-npz
-        run: |
-          source ${VENV}/bin/activate
-          mpirun -np 2 dlio_benchmark workload=unet3d ++workload.reader.data_loader=native_dali ++workload.train.computation_time=0.05 ++workload.evaluation.eval_time=0.01 ++workload.train.epochs=1 ++workload.workflow.train=False ++workload.workflow.generate_data=True ++workload.dataset.num_files_train=16 ++workload.dataset.num_files_eval=16 ++workload.reader.read_threads=2 ++workload.dataset.record_length=4096 ++workload.dataset.record_length_stdev=0
-          mpirun -np 2 dlio_benchmark workload=unet3d ++workload.reader.data_loader=native_dali ++workload.train.computation_time=0.05 ++workload.evaluation.eval_time=0.01 ++workload.train.epochs=1 ++workload.workflow.train=True ++workload.workflow.generate_data=False ++workload.dataset.num_files_train=16 ++workload.dataset.num_files_eval=16 ++workload.reader.read_threads=2  ++workload.dataset.record_length=4096 ++workload.dataset.record_length_stdev=0
+      run: |
+        source ${VENV}/bin/activate
+        mpirun -np 2 dlio_benchmark workload=unet3d ++workload.reader.data_loader=native_dali ++workload.train.computation_time=0.05 ++workload.evaluation.eval_time=0.01 ++workload.train.epochs=1 ++workload.workflow.train=False ++workload.workflow.generate_data=True ++workload.dataset.num_files_train=16 ++workload.dataset.num_files_eval=16 ++workload.reader.read_threads=2 ++workload.dataset.record_length=4096 ++workload.dataset.record_length_stdev=0
+        mpirun -np 2 dlio_benchmark workload=unet3d ++workload.reader.data_loader=native_dali ++workload.train.computation_time=0.05 ++workload.evaluation.eval_time=0.01 ++workload.train.epochs=1 ++workload.workflow.train=True ++workload.workflow.generate_data=False ++workload.dataset.num_files_train=16 ++workload.dataset.num_files_eval=16 ++workload.reader.read_threads=2  ++workload.dataset.record_length=4096 ++workload.dataset.record_length_stdev=0
     - name: test-tf-native-dali-loader-npz
-        run: |
-          source ${VENV}/bin/activate
-          mpirun -np 2 dlio_benchmark workload=unet3d ++workload.framework=tensorflow ++workload.reader.data_loader=native_dali ++workload.train.computation_time=0.05 ++workload.evaluation.eval_time=0.01 ++workload.train.epochs=1 ++workload.workflow.train=False ++workload.workflow.generate_data=True ++workload.dataset.num_files_train=16 ++workload.dataset.num_files_eval=16 ++workload.reader.read_threads=2 ++workload.dataset.record_length=4096 ++workload.dataset.record_length_stdev=0
-          mpirun -np 2 dlio_benchmark workload=unet3d ++workload.framework=tensorflow ++workload.reader.data_loader=native_dali ++workload.train.computation_time=0.05 ++workload.evaluation.eval_time=0.01 ++workload.train.epochs=1 ++workload.workflow.train=True ++workload.workflow.generate_data=False ++workload.dataset.num_files_train=16 ++workload.dataset.num_files_eval=16 ++workload.reader.read_threads=2  ++workload.dataset.record_length=4096 ++workload.dataset.record_length_stdev=0
+      run: |
+        source ${VENV}/bin/activate
+        mpirun -np 2 dlio_benchmark workload=unet3d ++workload.framework=tensorflow ++workload.reader.data_loader=native_dali ++workload.train.computation_time=0.05 ++workload.evaluation.eval_time=0.01 ++workload.train.epochs=1 ++workload.workflow.train=False ++workload.workflow.generate_data=True ++workload.dataset.num_files_train=16 ++workload.dataset.num_files_eval=16 ++workload.reader.read_threads=2 ++workload.dataset.record_length=4096 ++workload.dataset.record_length_stdev=0
+        mpirun -np 2 dlio_benchmark workload=unet3d ++workload.framework=tensorflow ++workload.reader.data_loader=native_dali ++workload.train.computation_time=0.05 ++workload.evaluation.eval_time=0.01 ++workload.train.epochs=1 ++workload.workflow.train=True ++workload.workflow.generate_data=False ++workload.dataset.num_files_train=16 ++workload.dataset.num_files_eval=16 ++workload.reader.read_threads=2  ++workload.dataset.record_length=4096 ++workload.dataset.record_length_stdev=0
     - name: test_subset
       run: |
         source ${VENV}/bin/activate

From 344298d68532b5977ede1a3fa9c8e93eb7fa1307 Mon Sep 17 00:00:00 2001
From: Huihuo Zheng <zhenghh04@gmail.com>
Date: Mon, 11 Dec 2023 17:14:37 +0000
Subject: [PATCH 18/36] fixed read return

---
 dlio_benchmark/reader/reader_handler.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/dlio_benchmark/reader/reader_handler.py b/dlio_benchmark/reader/reader_handler.py
index 1b6f9189..6370e4fb 100644
--- a/dlio_benchmark/reader/reader_handler.py
+++ b/dlio_benchmark/reader/reader_handler.py
@@ -126,7 +126,7 @@ def finalize(self):
 
     @abstractmethod
     def read(self):
-        pass
+        return
 
     def __del__(self):
         self.thread_index = None

From ac025eb7ded7bff8e45c70dc42fb015291d70650 Mon Sep 17 00:00:00 2001
From: Huihuo Zheng <zhenghh04@gmail.com>
Date: Mon, 11 Dec 2023 19:20:59 +0000
Subject: [PATCH 19/36] removed abstractmethod

---
 dlio_benchmark/reader/reader_handler.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/dlio_benchmark/reader/reader_handler.py b/dlio_benchmark/reader/reader_handler.py
index 6370e4fb..8084bbdd 100644
--- a/dlio_benchmark/reader/reader_handler.py
+++ b/dlio_benchmark/reader/reader_handler.py
@@ -124,8 +124,8 @@ def finalize(self):
                 self.close(filename)
                 self.open_file_map[filename] = None
 
-    @abstractmethod
     def read(self):
+        logging.error("This method is not implemented!")
         return
 
     def __del__(self):

From d2d544f58f92afa99191835b68e89d2ce65419dd Mon Sep 17 00:00:00 2001
From: Huihuo Zheng <zhenghh04@gmail.com>
Date: Mon, 11 Dec 2023 21:09:51 +0000
Subject: [PATCH 20/36] fixed bugs

---
 dlio_benchmark/reader/dali_image_reader.py    | 22 +++++++++++---
 dlio_benchmark/reader/dali_npy_reader.py      | 29 +++++++++++++++----
 dlio_benchmark/reader/dali_tfrecord_reader.py | 21 +++++++++++---
 dlio_benchmark/reader/hdf5_reader.py          |  4 +++
 dlio_benchmark/reader/image_reader.py         |  4 +++
 dlio_benchmark/reader/npy_reader.py           |  6 +++-
 dlio_benchmark/reader/npz_reader.py           |  6 +++-
 dlio_benchmark/reader/reader_factory.py       |  6 ++--
 dlio_benchmark/reader/reader_handler.py       |  4 +--
 dlio_benchmark/reader/tf_reader.py            |  8 +++--
 10 files changed, 86 insertions(+), 24 deletions(-)

diff --git a/dlio_benchmark/reader/dali_image_reader.py b/dlio_benchmark/reader/dali_image_reader.py
index 258e9c7d..12c2f1da 100644
--- a/dlio_benchmark/reader/dali_image_reader.py
+++ b/dlio_benchmark/reader/dali_image_reader.py
@@ -31,13 +31,27 @@
 
 class DaliImageReader(FormatReader):
     @dlp.log_init
-    def __init__(dataset_type, thread_index, epoch):
+    def __init__(self, dataset_type, thread_index, epoch):
         super().__init__(dataset_type, thread_index)
+    def open(self):
+        super().open()
+
+    def close(self):
+        super().close()
+    
+    def get_sample(self, filename, sample_index):
+        super().get_sample(filename, sample_index)
+
+    def next(self):
+        super().next()
+
+    def read_index(self):
+        super().read_index()
 
     @dlp.log
     def read(self):
         logging.debug(
-            f"{utcnow()} Reading {len(self.file_list)} files rank {self._args.my_rank}")
+            f"{utcnow()} Reading {len(self._file_list)} files rank {self._args.my_rank}")
         random_shuffle = False
         seed = -1
         seed_change_epoch = False
@@ -56,7 +70,7 @@ def read(self):
         stick_to_shard = True
         if seed_change_epoch:
             stick_to_shard = False
-        images, labels = fn.readers.file(files=self.file_list, num_shards=self._args.comm_size, 
+        images, labels = fn.readers.file(files=self._file_list, num_shards=self._args.comm_size, 
                                          prefetch_queue_depth=prefetch_size, 
                                          initial_fill=initial_fill, random_shuffle=random_shuffle, 
                                          shuffle_after_epoch=seed_change_epoch, 
@@ -75,7 +89,7 @@ def _preprocess(self, dataset):
 
     @dlp.log
     def _resize(self, dataset):
-        return nvidia.dali.fn.reshape(dataset, shape=[self._args.max_dimension, self._args.max_dimension])
+        return fn.resize(dataset, size=[self._args.max_dimension, self._args.max_dimension])
 
     @dlp.log
     def finalize(self):
diff --git a/dlio_benchmark/reader/dali_npy_reader.py b/dlio_benchmark/reader/dali_npy_reader.py
index a037a0db..c2d3bf4f 100644
--- a/dlio_benchmark/reader/dali_npy_reader.py
+++ b/dlio_benchmark/reader/dali_npy_reader.py
@@ -20,7 +20,7 @@
 
 import nvidia.dali.fn as fn
 from dlio_benchmark.common.constants import MODULE_DATA_READER
-from dlio_benchmark.dlio_benchmark.reader.reader_handler import FormatReader
+from dlio_benchmark.reader.reader_handler import FormatReader
 from dlio_benchmark.utils.utility import utcnow
 from dlio_benchmark.common.enumerations import DatasetType, Shuffle
 import nvidia.dali.tfrecord as tfrec
@@ -31,13 +31,28 @@
 
 class DaliNPYReader(FormatReader):
     @dlp.log_init
-    def __init__(dataset_type, thread_index, epoch):
+    def __init__(self, dataset_type, thread_index, epoch):
         super().__init__(dataset_type, thread_index)
 
+    def open(self):
+        super().open()
+
+    def close(self):
+        super().close()
+    
+    def get_sample(self, filename, sample_index):
+        super().get_sample(filename, sample_index)
+
+    def next(self):
+        super().next()
+
+    def read_index(self):
+        super().read_index()
+
     @dlp.log
     def read(self):
         logging.debug(
-            f"{utcnow()} Reading {len(self.file_list)} files rank {self._args.my_rank}")
+            f"{utcnow()} Reading {len(self._file_list)} files rank {self._args.my_rank}")
         random_shuffle = False
         seed = -1
         seed_change_epoch = False
@@ -54,10 +69,13 @@ def read(self):
             prefetch_size = self._args.prefetch_size
 
         stick_to_shard = True
+        if random_shuffle == True:
+            seed_change_epoch = False
         if seed_change_epoch:
             stick_to_shard = False
 
-        dataset = fn.readers.numpy(device='cpu', files=self.file_list, num_shards=self._args.comm_size,
+
+        dataset = fn.readers.numpy(device='cpu', files=self._file_list, num_shards=self._args.comm_size,
                                    prefetch_queue_depth=prefetch_size, initial_fill=initial_fill,
                                    random_shuffle=random_shuffle, seed=seed, shuffle_after_epoch=seed_change_epoch,
                                    stick_to_shard=stick_to_shard, pad_last_batch=True)
@@ -76,7 +94,8 @@ def _preprocess(self, dataset):
 
     @dlp.log
     def _resize(self, dataset):
-        return nvidia.dali.fn.reshape(dataset, shape=[self._args.max_dimension, self._args.max_dimension])
+        return fn.resize(dataset, size=[self._args.max_dimension, self._args.max_dimension])
+    
     @dlp.log
     def finalize(self):
         pass
diff --git a/dlio_benchmark/reader/dali_tfrecord_reader.py b/dlio_benchmark/reader/dali_tfrecord_reader.py
index 129e6d44..856d8bfd 100644
--- a/dlio_benchmark/reader/dali_tfrecord_reader.py
+++ b/dlio_benchmark/reader/dali_tfrecord_reader.py
@@ -34,9 +34,22 @@
 
 class DaliTFRecordReader(FormatReader):
     @dlp.log_init
-    def __init__(dataset_type, thread_index, epoch):
+    def __init__(self, dataset_type, thread_index, epoch):
         super().__init__(dataset_type, thread_index)
+    def open(self):
+        super().open()
 
+    def close(self):
+        super().close()
+    
+    def get_sample(self, filename, sample_index):
+        super().get_sample(filename, sample_index)
+
+    def next(self):
+        super().next()
+
+    def read_index(self):
+        super().read_index()
     @dlp.log
     def read(self):
         folder = "valid"
@@ -44,7 +57,7 @@ def read(self):
             folder = "train"
         index_folder = f"{self._args.data_folder}/index/{folder}"
         index_files = []
-        for file in self.file_list:
+        for file in self._file_list:
             filename = os.path.basename(file)
             index_files.append(f"{index_folder}/{filename}.idx")
         logging.info(
@@ -61,7 +74,7 @@ def read(self):
         prefetch_size = 1
         if self._args.prefetch_size > 0:
             prefetch_size = self._args.prefetch_size
-        dataset = fn.readers.tfrecord(path=self.file_list,
+        dataset = fn.readers.tfrecord(path=self._file_list,
                                       index_path=index_files,
                                       features={
                                           'image': tfrec.FixedLenFeature((), tfrec.string, ""),
@@ -84,7 +97,7 @@ def _preprocess(self, dataset):
 
     @dlp.log
     def _resize(self, dataset):
-        return nvidia.dali.fn.reshape(dataset, shape=[self._args.max_dimension, self._args.max_dimension])
+        return fn.resize(dataset, size=[self._args.max_dimension, self._args.max_dimension])
 
     @dlp.log
     def finalize(self):
diff --git a/dlio_benchmark/reader/hdf5_reader.py b/dlio_benchmark/reader/hdf5_reader.py
index f95d9b94..b8801abf 100644
--- a/dlio_benchmark/reader/hdf5_reader.py
+++ b/dlio_benchmark/reader/hdf5_reader.py
@@ -58,6 +58,10 @@ def next(self):
     def read_index(self, image_idx, step):
         return super().read_index(image_idx, step)
 
+    @dlp.log
+    def read(self):
+        return super().read()
+
     @dlp.log
     def finalize(self):
         return super().finalize()
diff --git a/dlio_benchmark/reader/image_reader.py b/dlio_benchmark/reader/image_reader.py
index 1fe63a05..2848c868 100644
--- a/dlio_benchmark/reader/image_reader.py
+++ b/dlio_benchmark/reader/image_reader.py
@@ -59,6 +59,10 @@ def next(self):
     def read_index(self, image_idx, step):
         return super().read_index(image_idx, step)
 
+    @dlp.log
+    def read(self):
+        return super().read()
+
     @dlp.log
     def finalize(self):
         return super().finalize()
diff --git a/dlio_benchmark/reader/npy_reader.py b/dlio_benchmark/reader/npy_reader.py
index 435df6b3..931e3355 100644
--- a/dlio_benchmark/reader/npy_reader.py
+++ b/dlio_benchmark/reader/npy_reader.py
@@ -57,4 +57,8 @@ def read_index(self, image_idx, step):
 
     @dlp.log
     def finalize(self):
-        return super().finalize()
\ No newline at end of file
+        return super().finalize()
+    
+    @dlp.log
+    def read(self):
+        return super().read()
\ No newline at end of file
diff --git a/dlio_benchmark/reader/npz_reader.py b/dlio_benchmark/reader/npz_reader.py
index f0144f74..678ebd12 100644
--- a/dlio_benchmark/reader/npz_reader.py
+++ b/dlio_benchmark/reader/npz_reader.py
@@ -57,4 +57,8 @@ def read_index(self, image_idx, step):
 
     @dlp.log
     def finalize(self):
-        return super().finalize()
\ No newline at end of file
+        return super().finalize()
+    
+    @dlp.log
+    def read(self):
+        return super().read()
\ No newline at end of file
diff --git a/dlio_benchmark/reader/reader_factory.py b/dlio_benchmark/reader/reader_factory.py
index c1404722..e84db142 100644
--- a/dlio_benchmark/reader/reader_factory.py
+++ b/dlio_benchmark/reader/reader_factory.py
@@ -59,8 +59,7 @@ def get_reader(type, dataset_type, thread_index, epoch_number):
                 return NPYReader(dataset_type, thread_index, epoch_number)                         
         elif type == FormatType.NPZ:
             if _args.data_loader == DataLoaderType.NATIVE_DALI:
-                print("Loading data of %s format is not supported without framework data loader" %type)
-                raise Exception(type)
+                raise Exception("Loading data of %s format is not supported without framework data loader; please use npy format instead." %type)
             else:
                 from dlio_benchmark.reader.npz_reader import NPZReader
                 return NPZReader(dataset_type, thread_index, epoch_number)
@@ -72,5 +71,4 @@ def get_reader(type, dataset_type, thread_index, epoch_number):
                 from dlio_benchmark.reader.tf_reader import TFReader
                 return TFReader(dataset_type, thread_index, epoch_number) 
         else:
-            print("Loading data of %s format is not supported without framework data loader" %type)
-            raise Exception(type)
+            raise Exception("Loading data of %s format is not supported without framework data loader" %type)
\ No newline at end of file
diff --git a/dlio_benchmark/reader/reader_handler.py b/dlio_benchmark/reader/reader_handler.py
index 8084bbdd..93dee926 100644
--- a/dlio_benchmark/reader/reader_handler.py
+++ b/dlio_benchmark/reader/reader_handler.py
@@ -49,6 +49,7 @@ def __init__(self, dataset_type, thread_index):
             FormatReader.read_images = 0
         self.step = 1
         self.image_idx = 0
+        self._file_list = self._args.file_list_train if self.dataset_type is DatasetType.TRAIN else self._args.file_list_eval 
         self.batch_size = self._args.batch_size if self.dataset_type is DatasetType.TRAIN else self._args.batch_size_eval
 
     @dlp.log
@@ -123,9 +124,8 @@ def finalize(self):
             if filename in self.open_file_map:
                 self.close(filename)
                 self.open_file_map[filename] = None
-
+    @abstractmethod
     def read(self):
-        logging.error("This method is not implemented!")
         return
 
     def __del__(self):
diff --git a/dlio_benchmark/reader/tf_reader.py b/dlio_benchmark/reader/tf_reader.py
index a889076b..56b9ca58 100644
--- a/dlio_benchmark/reader/tf_reader.py
+++ b/dlio_benchmark/reader/tf_reader.py
@@ -37,9 +37,7 @@ class TFReader(FormatReader):
     def __init__(self, dataset_type, thread_index, epoch):
         super().__init__(dataset_type, thread_index)
         self._dataset = None
-        self._file_list = self._args.file_list_train if self.dataset_type is DatasetType.TRAIN else self._args.file_list_eval
-        self.batch_size = self._args.batch_size if self.dataset_type is DatasetType.TRAIN else self._args.batch_size_eval
-    @dlp.log
+   @dlp.log
     def open(self, filename):
         pass
 
@@ -105,3 +103,7 @@ def read_index(self, image_idx, step):
     @dlp.log
     def finalize(self):
         return super().finalize()
+
+    @dlp.log
+    def read(self):
+        return super().read()
\ No newline at end of file

From 7312c0143c42b36fb2781675367c678ae318c02c Mon Sep 17 00:00:00 2001
From: Huihuo Zheng <zhenghh04@gmail.com>
Date: Mon, 11 Dec 2023 21:15:45 +0000
Subject: [PATCH 21/36] added dont_use_mmap

---
 dlio_benchmark/reader/dali_image_reader.py    | 3 ++-
 dlio_benchmark/reader/dali_npy_reader.py      | 3 ++-
 dlio_benchmark/reader/dali_tfrecord_reader.py | 3 ++-
 dlio_benchmark/utils/config.py                | 3 +++
 4 files changed, 9 insertions(+), 3 deletions(-)

diff --git a/dlio_benchmark/reader/dali_image_reader.py b/dlio_benchmark/reader/dali_image_reader.py
index 12c2f1da..54bab5fd 100644
--- a/dlio_benchmark/reader/dali_image_reader.py
+++ b/dlio_benchmark/reader/dali_image_reader.py
@@ -74,7 +74,8 @@ def read(self):
                                          prefetch_queue_depth=prefetch_size, 
                                          initial_fill=initial_fill, random_shuffle=random_shuffle, 
                                          shuffle_after_epoch=seed_change_epoch, 
-                                         stick_to_shard=stick_to_shard, pad_last_batch=True)
+                                         stick_to_shard=stick_to_shard, pad_last_batch=True, 
+                                         dont_use_mmap=self._args.dont_use_mmap)
         dataset = fn.decoders.image(images, device='cpu')
         dataset = self._preprocess(images)
         dataset = self._resize(images)
diff --git a/dlio_benchmark/reader/dali_npy_reader.py b/dlio_benchmark/reader/dali_npy_reader.py
index c2d3bf4f..b7b9947a 100644
--- a/dlio_benchmark/reader/dali_npy_reader.py
+++ b/dlio_benchmark/reader/dali_npy_reader.py
@@ -78,7 +78,8 @@ def read(self):
         dataset = fn.readers.numpy(device='cpu', files=self._file_list, num_shards=self._args.comm_size,
                                    prefetch_queue_depth=prefetch_size, initial_fill=initial_fill,
                                    random_shuffle=random_shuffle, seed=seed, shuffle_after_epoch=seed_change_epoch,
-                                   stick_to_shard=stick_to_shard, pad_last_batch=True)
+                                   stick_to_shard=stick_to_shard, pad_last_batch=True, 
+                                   dont_use_mmap=self._args.dont_use_mmap)
 
         dataset = self._preprocess(dataset)
         dataset = self._resize(dataset)
diff --git a/dlio_benchmark/reader/dali_tfrecord_reader.py b/dlio_benchmark/reader/dali_tfrecord_reader.py
index 856d8bfd..01b20c7d 100644
--- a/dlio_benchmark/reader/dali_tfrecord_reader.py
+++ b/dlio_benchmark/reader/dali_tfrecord_reader.py
@@ -83,7 +83,8 @@ def read(self):
                                       prefetch_queue_depth=prefetch_size,
                                       initial_fill=initial_fill,
                                       random_shuffle=random_shuffle, seed=seed,
-                                      stick_to_shard=True, pad_last_batch=True)
+                                      stick_to_shard=True, pad_last_batch=True, 
+                                      dont_use_mmap=self._args.dont_use_mmap)
         dataset = self._preprocess(dataset["image"])
         dataset = self._resize(dataset)
         return dataset
diff --git a/dlio_benchmark/utils/config.py b/dlio_benchmark/utils/config.py
index 05bcd400..eebf8525 100644
--- a/dlio_benchmark/utils/config.py
+++ b/dlio_benchmark/utils/config.py
@@ -78,6 +78,7 @@ class ConfigArguments:
     steps_between_checkpoints: int = -1
     transfer_size: int = None
     read_threads: int = 1
+    dont_use_mmap: bool = False
     computation_threads: int = 1
     computation_time: float = 0.
     computation_time_stdev: float = 0.
@@ -349,6 +350,8 @@ def LoadConfig(args, config):
     elif 'reader' in config:
         reader = config['reader']
     if reader is not None:
+        if 'dont_use_mmap' in reader:
+            args.dont_use_mmap = reader['dont_use_mmap']
         if 'reader_classname' in reader:
             args.reader_classname = reader['reader_classname']
         if 'multiprocessing_context' in reader:

From 02c3855b784f7e044b0e40d8089b870ac78854cc Mon Sep 17 00:00:00 2001
From: Huihuo Zheng <zhenghh04@gmail.com>
Date: Mon, 11 Dec 2023 21:23:25 +0000
Subject: [PATCH 22/36] fixed indent

---
 dlio_benchmark/reader/tf_reader.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/dlio_benchmark/reader/tf_reader.py b/dlio_benchmark/reader/tf_reader.py
index 56b9ca58..f40dbdcc 100644
--- a/dlio_benchmark/reader/tf_reader.py
+++ b/dlio_benchmark/reader/tf_reader.py
@@ -37,7 +37,8 @@ class TFReader(FormatReader):
     def __init__(self, dataset_type, thread_index, epoch):
         super().__init__(dataset_type, thread_index)
         self._dataset = None
-   @dlp.log
+    
+    @dlp.log
     def open(self, filename):
         pass
 

From 60c508c59a38003ee2ebbfd22ffc9c5f1aa1d02d Mon Sep 17 00:00:00 2001
From: Huihuo Zheng <zhenghh04@gmail.com>
Date: Mon, 11 Dec 2023 21:26:17 +0000
Subject: [PATCH 23/36] fixed csvreader

---
 dlio_benchmark/reader/csv_reader.py     | 4 ++++
 dlio_benchmark/reader/reader_handler.py | 1 +
 2 files changed, 5 insertions(+)

diff --git a/dlio_benchmark/reader/csv_reader.py b/dlio_benchmark/reader/csv_reader.py
index be1e587b..cea1cd17 100644
--- a/dlio_benchmark/reader/csv_reader.py
+++ b/dlio_benchmark/reader/csv_reader.py
@@ -58,3 +58,7 @@ def read_index(self, image_idx, step):
     @dlp.log
     def finalize(self):
         return super().finalize()
+
+    @dlp.log
+    def read(self):
+        return super().read()
\ No newline at end of file
diff --git a/dlio_benchmark/reader/reader_handler.py b/dlio_benchmark/reader/reader_handler.py
index 93dee926..3ecd79c2 100644
--- a/dlio_benchmark/reader/reader_handler.py
+++ b/dlio_benchmark/reader/reader_handler.py
@@ -124,6 +124,7 @@ def finalize(self):
             if filename in self.open_file_map:
                 self.close(filename)
                 self.open_file_map[filename] = None
+
     @abstractmethod
     def read(self):
         return

From 5e96841793dfecefb4fd9f951e627b40b7c8227d Mon Sep 17 00:00:00 2001
From: Huihuo Zheng <zhenghh04@gmail.com>
Date: Mon, 11 Dec 2023 21:41:10 +0000
Subject: [PATCH 24/36] native_dali test with npy format instead of npz

---
 .github/workflows/python-package-conda.yml    | 12 ++---
 .../data_generator/npy_generator.py           | 53 +++++++++++++++++++
 2 files changed, 59 insertions(+), 6 deletions(-)
 create mode 100644 dlio_benchmark/data_generator/npy_generator.py

diff --git a/.github/workflows/python-package-conda.yml b/.github/workflows/python-package-conda.yml
index b649696c..5ac1ed5d 100644
--- a/.github/workflows/python-package-conda.yml
+++ b/.github/workflows/python-package-conda.yml
@@ -154,16 +154,16 @@ jobs:
         source ${VENV}/bin/activate
         mpirun -np 2 dlio_benchmark workload=unet3d ++workload.framework=tensorflow ++workload.data_reader.data_loader=tensorflow ++workload.train.computation_time=0.05 ++workload.evaluation.eval_time=0.01 ++workload.train.epochs=2 ++workload.workflow.train=False ++workload.workflow.generate_data=True ++workload.dataset.num_files_train=16 ++workload.dataset.num_files_eval=16 ++workload.reader.read_threads=2  ++workload.dataset.record_length=4096 ++workload.dataset.record_length_stdev=0
         mpirun -np 2 dlio_benchmark workload=unet3d ++workload.framework=tensorflow ++workload.data_reader.data_loader=tensorflow ++workload.train.computation_time=0.05 ++workload.evaluation.eval_time=0.01 ++workload.train.epochs=2 ++workload.workflow.train=True ++workload.workflow.generate_data=False ++workload.dataset.num_files_train=16 ++workload.dataset.num_files_eval=16 ++workload.reader.read_threads=2  ++workload.dataset.record_length=4096 ++workload.dataset.record_length_stdev=0
-    - name: test-torch-native-dali-loader-npz
+    - name: test-torch-native-dali-loader-npy
       run: |
         source ${VENV}/bin/activate
-        mpirun -np 2 dlio_benchmark workload=unet3d ++workload.reader.data_loader=native_dali ++workload.train.computation_time=0.05 ++workload.evaluation.eval_time=0.01 ++workload.train.epochs=1 ++workload.workflow.train=False ++workload.workflow.generate_data=True ++workload.dataset.num_files_train=16 ++workload.dataset.num_files_eval=16 ++workload.reader.read_threads=2 ++workload.dataset.record_length=4096 ++workload.dataset.record_length_stdev=0
-        mpirun -np 2 dlio_benchmark workload=unet3d ++workload.reader.data_loader=native_dali ++workload.train.computation_time=0.05 ++workload.evaluation.eval_time=0.01 ++workload.train.epochs=1 ++workload.workflow.train=True ++workload.workflow.generate_data=False ++workload.dataset.num_files_train=16 ++workload.dataset.num_files_eval=16 ++workload.reader.read_threads=2  ++workload.dataset.record_length=4096 ++workload.dataset.record_length_stdev=0
-    - name: test-tf-native-dali-loader-npz
+        mpirun -np 2 dlio_benchmark workload=unet3d ++workload.reader.data_loader=native_dali ++workload.dataset.format=npy ++workload.train.computation_time=0.05 ++workload.evaluation.eval_time=0.01 ++workload.train.epochs=1 ++workload.workflow.train=False ++workload.workflow.generate_data=True ++workload.dataset.num_files_train=16 ++workload.dataset.num_files_eval=16 ++workload.reader.read_threads=2 ++workload.dataset.record_length=4096 ++workload.dataset.record_length_stdev=0
+        mpirun -np 2 dlio_benchmark workload=unet3d ++workload.reader.data_loader=native_dali ++workload.dataset.format=npy ++workload.train.computation_time=0.05 ++workload.evaluation.eval_time=0.01 ++workload.train.epochs=1 ++workload.workflow.train=True ++workload.workflow.generate_data=False ++workload.dataset.num_files_train=16 ++workload.dataset.num_files_eval=16 ++workload.reader.read_threads=2  ++workload.dataset.record_length=4096 ++workload.dataset.record_length_stdev=0
+    - name: test-tf-native-dali-loader-npy
       run: |
         source ${VENV}/bin/activate
-        mpirun -np 2 dlio_benchmark workload=unet3d ++workload.framework=tensorflow ++workload.reader.data_loader=native_dali ++workload.train.computation_time=0.05 ++workload.evaluation.eval_time=0.01 ++workload.train.epochs=1 ++workload.workflow.train=False ++workload.workflow.generate_data=True ++workload.dataset.num_files_train=16 ++workload.dataset.num_files_eval=16 ++workload.reader.read_threads=2 ++workload.dataset.record_length=4096 ++workload.dataset.record_length_stdev=0
-        mpirun -np 2 dlio_benchmark workload=unet3d ++workload.framework=tensorflow ++workload.reader.data_loader=native_dali ++workload.train.computation_time=0.05 ++workload.evaluation.eval_time=0.01 ++workload.train.epochs=1 ++workload.workflow.train=True ++workload.workflow.generate_data=False ++workload.dataset.num_files_train=16 ++workload.dataset.num_files_eval=16 ++workload.reader.read_threads=2  ++workload.dataset.record_length=4096 ++workload.dataset.record_length_stdev=0
+        mpirun -np 2 dlio_benchmark workload=unet3d ++workload.framework=tensorflow ++workload.dataset.format=npy ++workload.reader.data_loader=native_dali ++workload.train.computation_time=0.05 ++workload.evaluation.eval_time=0.01 ++workload.train.epochs=1 ++workload.workflow.train=False ++workload.workflow.generate_data=True ++workload.dataset.num_files_train=16 ++workload.dataset.num_files_eval=16 ++workload.reader.read_threads=2 ++workload.dataset.record_length=4096 ++workload.dataset.record_length_stdev=0
+        mpirun -np 2 dlio_benchmark workload=unet3d ++workload.framework=tensorflow ++workload.dataset.format=npy ++workload.reader.data_loader=native_dali ++workload.train.computation_time=0.05 ++workload.evaluation.eval_time=0.01 ++workload.train.epochs=1 ++workload.workflow.train=True ++workload.workflow.generate_data=False ++workload.dataset.num_files_train=16 ++workload.dataset.num_files_eval=16 ++workload.reader.read_threads=2  ++workload.dataset.record_length=4096 ++workload.dataset.record_length_stdev=0
     - name: test_subset
       run: |
         source ${VENV}/bin/activate
diff --git a/dlio_benchmark/data_generator/npy_generator.py b/dlio_benchmark/data_generator/npy_generator.py
new file mode 100644
index 00000000..d60ec2f3
--- /dev/null
+++ b/dlio_benchmark/data_generator/npy_generator.py
@@ -0,0 +1,53 @@
+"""
+   Copyright (c) 2022, UChicago Argonne, LLC
+   All Rights Reserved
+
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+
+       http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License.
+"""
+
+from dlio_benchmark.common.enumerations import Compression
+from dlio_benchmark.data_generator.data_generator import DataGenerator
+
+import logging
+import numpy as np
+
+from dlio_benchmark.utils.utility import progress, utcnow
+from dlio_profiler.logger import fn_interceptor as Profile
+from shutil import copyfile
+from dlio_benchmark.common.constants import MODULE_DATA_GENERATOR
+
+dlp = Profile(MODULE_DATA_GENERATOR)
+
+"""
+Generator for creating data in NPZ format.
+"""
+class NPYGenerator(DataGenerator):
+    def __init__(self):
+        super().__init__()
+
+    @dlp.log
+    def generate(self):
+        """
+        Generator for creating data in NPY format of 3d dataset.
+        """
+        super().generate()
+        np.random.seed(10)
+        record_labels = [0] * self.num_samples
+        for i in dlp.iter(range(self.my_rank, int(self.total_files_to_generate), self.comm_size)):
+            dim1, dim2 = self.get_dimension()
+            records = np.random.randint(255, size=(dim1, dim2, self.num_samples), dtype=np.uint8)
+            out_path_spec = self.storage.get_uri(self._file_list[i])
+            progress(i+1, self.total_files_to_generate, "Generating NPY Data")
+            prev_out_spec = out_path_spec
+            np.save(out_path_spec, records)
+        np.random.seed()

From b1412e306da96895233e7f8b03de1a5b14eb34c9 Mon Sep 17 00:00:00 2001
From: Huihuo Zheng <zhenghh04@gmail.com>
Date: Mon, 11 Dec 2023 21:53:23 +0000
Subject: [PATCH 25/36] fixed issue of enum

---
 dlio_benchmark/common/enumerations.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/dlio_benchmark/common/enumerations.py b/dlio_benchmark/common/enumerations.py
index 60ba5f63..9fab8f23 100644
--- a/dlio_benchmark/common/enumerations.py
+++ b/dlio_benchmark/common/enumerations.py
@@ -101,7 +101,7 @@ class FormatType(Enum):
     def __str__(self):
         return self.value
 
-    @ staticmethod
+    @staticmethod
     def get_enum(value):
         if FormatType.TFRECORD.value == value:
             return FormatType.TFRECORD

From d659e5fcafdfa536875e9af2cc77b8d0c9028c53 Mon Sep 17 00:00:00 2001
From: Huihuo Zheng <zhenghh04@gmail.com>
Date: Mon, 11 Dec 2023 21:57:56 +0000
Subject: [PATCH 26/36] modify action so that dlio will always be installed

---
 .github/workflows/python-package-conda.yml | 1 -
 1 file changed, 1 deletion(-)

diff --git a/.github/workflows/python-package-conda.yml b/.github/workflows/python-package-conda.yml
index 5ac1ed5d..6f584b8f 100644
--- a/.github/workflows/python-package-conda.yml
+++ b/.github/workflows/python-package-conda.yml
@@ -50,7 +50,6 @@ jobs:
         sudo apt-get install $CC $CXX libc6
         sudo apt-get install mpich libhwloc-dev
     - name: Install DLIO
-      if: steps.cache-modules.outputs.cache-hit != 'true'
       run: |
         echo "Profiler ${DLIO_PROFILER} gcc $CC"
         python -m pip install --upgrade pip

From 96fa9c3cff539e673a705c42a8fff7dc54f160fb Mon Sep 17 00:00:00 2001
From: Huihuo Zheng <zhenghh04@gmail.com>
Date: Mon, 11 Dec 2023 23:32:19 +0000
Subject: [PATCH 27/36] [skip ci] added documentation for dali

---
 README.md              | 2 +-
 docs/source/config.rst | 4 +++-
 2 files changed, 4 insertions(+), 2 deletions(-)

diff --git a/README.md b/README.md
index 3ae53fbb..5990ff57 100644
--- a/README.md
+++ b/README.md
@@ -23,7 +23,7 @@ dlio_benchmark ++workload.workflow.generate_data=True
 git clone https://github.com/argonne-lcf/dlio_benchmark
 cd dlio_benchmark/
 pip install .[dlio_profiler]
-
+```
 ## Container
 
 ```bash
diff --git a/docs/source/config.rst b/docs/source/config.rst
index 920b6590..9d245ca2 100644
--- a/docs/source/config.rst
+++ b/docs/source/config.rst
@@ -189,7 +189,7 @@ reader
      - Description
    * - data_loader
      - tensorflow
-     - select the data loader to use [tensorflow|pytorch|dali]. 
+     - select the data loader to use [tensorflow|pytorch|dali|native_dali]. 
    * - batch_size
      - 1 
      - batch size for training
@@ -227,6 +227,8 @@ reader
 
   For pytorch, ``prefetch_size`` is set to be 0, it will be changed to 2. In other words, the default value for ``prefetch_size`` in pytorch is 2. 
 
+  For Dali data loader, we support two options, ``dali`` and ``native_dali```. ``dali`` uses our internal reader, such as ``jpeg_reader``, ``hdf5_reader``, etc, and ``dali.fn.external_source``; whereas ``native_dali`` directly uses Dali readers, such as ``dn.readers.numpy``, ``fn.readers.tfrecord``, and ``fn.readers.file``. 
+
 
 train
 ------------------

From f9aaac28a93e9984261c93f5c0bf498dd3489fd6 Mon Sep 17 00:00:00 2001
From: Huihuo Zheng <zhenghh04@gmail.com>
Date: Tue, 12 Dec 2023 16:19:38 +0000
Subject: [PATCH 28/36] removed read; and define it as pipeline

---
 .../data_loader/native_dali_data_loader.py    |  6 +--
 dlio_benchmark/reader/csv_reader.py           |  6 +--
 dlio_benchmark/reader/dali_image_reader.py    | 21 +++++----
 dlio_benchmark/reader/dali_npy_reader.py      | 44 +++++++++----------
 dlio_benchmark/reader/dali_tfrecord_reader.py | 33 +++++++-------
 dlio_benchmark/reader/hdf5_reader.py          | 13 ++----
 dlio_benchmark/reader/image_reader.py         |  4 --
 dlio_benchmark/reader/npy_reader.py           |  6 +--
 dlio_benchmark/reader/npz_reader.py           |  6 +--
 dlio_benchmark/reader/reader_handler.py       |  6 +--
 dlio_benchmark/reader/tf_reader.py            | 10 ++---
 11 files changed, 60 insertions(+), 95 deletions(-)

diff --git a/dlio_benchmark/data_loader/native_dali_data_loader.py b/dlio_benchmark/data_loader/native_dali_data_loader.py
index 8799800b..900c8c6d 100644
--- a/dlio_benchmark/data_loader/native_dali_data_loader.py
+++ b/dlio_benchmark/data_loader/native_dali_data_loader.py
@@ -38,9 +38,9 @@ def read(self):
                                exec_async=False, exec_pipelined=False)
         with pipeline:
             images = ReaderFactory.get_reader(type=self.format_type,
-                                                      dataset_type=self.dataset_type,
-                                                      thread_index=-1,
-                                                      epoch_number=self.epoch_number).read()
+                                            dataset_type=self.dataset_type,
+                                            thread_index=-1,
+                                            epoch_number=self.epoch_number).pipeline()
             pipeline.set_outputs(images)
         self.pipelines.append(pipeline)
 
diff --git a/dlio_benchmark/reader/csv_reader.py b/dlio_benchmark/reader/csv_reader.py
index cea1cd17..0690fc69 100644
--- a/dlio_benchmark/reader/csv_reader.py
+++ b/dlio_benchmark/reader/csv_reader.py
@@ -57,8 +57,4 @@ def read_index(self, image_idx, step):
 
     @dlp.log
     def finalize(self):
-        return super().finalize()
-
-    @dlp.log
-    def read(self):
-        return super().read()
\ No newline at end of file
+        return super().finalize()
\ No newline at end of file
diff --git a/dlio_benchmark/reader/dali_image_reader.py b/dlio_benchmark/reader/dali_image_reader.py
index 54bab5fd..02f0756b 100644
--- a/dlio_benchmark/reader/dali_image_reader.py
+++ b/dlio_benchmark/reader/dali_image_reader.py
@@ -16,7 +16,8 @@
 """
 import math
 import logging
-from time import time
+from time import time, sleep
+import numpy as np
 
 import nvidia.dali.fn as fn
 from dlio_benchmark.common.constants import MODULE_DATA_READER
@@ -33,9 +34,11 @@ class DaliImageReader(FormatReader):
     @dlp.log_init
     def __init__(self, dataset_type, thread_index, epoch):
         super().__init__(dataset_type, thread_index)
-    def open(self):
-        super().open()
 
+    @dlp.log        
+    def open(self, filename):
+        super().open(filename)
+    
     def close(self):
         super().close()
     
@@ -49,7 +52,7 @@ def read_index(self):
         super().read_index()
 
     @dlp.log
-    def read(self):
+    def pipeline(self):
         logging.debug(
             f"{utcnow()} Reading {len(self._file_list)} files rank {self._args.my_rank}")
         random_shuffle = False
@@ -76,17 +79,13 @@ def read(self):
                                          shuffle_after_epoch=seed_change_epoch, 
                                          stick_to_shard=stick_to_shard, pad_last_batch=True, 
                                          dont_use_mmap=self._args.dont_use_mmap)
-        dataset = fn.decoders.image(images, device='cpu')
-        dataset = self._preprocess(images)
+        images = fn.decoders.image(images, device='cpu')
         dataset = self._resize(images)
         return dataset
 
     @dlp.log
-    def _preprocess(self, dataset):
-        if self._args.preprocess_time != 0. or self._args.preprocess_time_stdev != 0.:
-            t = np.random.normal(self._args.preprocess_time, self._args.preprocess_time_stdev)
-            sleep(max(t, 0.0))
-        return dataset
+    def preprocess(self):
+        raise Exception("Emulated preprocessing method is not implemented in dali readers")
 
     @dlp.log
     def _resize(self, dataset):
diff --git a/dlio_benchmark/reader/dali_npy_reader.py b/dlio_benchmark/reader/dali_npy_reader.py
index b7b9947a..00ceada6 100644
--- a/dlio_benchmark/reader/dali_npy_reader.py
+++ b/dlio_benchmark/reader/dali_npy_reader.py
@@ -16,7 +16,8 @@
 """
 import math
 import logging
-from time import time
+from time import time, sleep
+import numpy as np
 
 import nvidia.dali.fn as fn
 from dlio_benchmark.common.constants import MODULE_DATA_READER
@@ -34,23 +35,12 @@ class DaliNPYReader(FormatReader):
     def __init__(self, dataset_type, thread_index, epoch):
         super().__init__(dataset_type, thread_index)
 
-    def open(self):
-        super().open()
-
-    def close(self):
-        super().close()
-    
-    def get_sample(self, filename, sample_index):
-        super().get_sample(filename, sample_index)
-
-    def next(self):
-        super().next()
-
-    def read_index(self):
-        super().read_index()
+    @dlp.log
+    def open(self, filename):
+        super().open(filename)
 
     @dlp.log
-    def read(self):
+    def pipeline(self):
         logging.debug(
             f"{utcnow()} Reading {len(self._file_list)} files rank {self._args.my_rank}")
         random_shuffle = False
@@ -74,24 +64,30 @@ def read(self):
         if seed_change_epoch:
             stick_to_shard = False
 
-
         dataset = fn.readers.numpy(device='cpu', files=self._file_list, num_shards=self._args.comm_size,
                                    prefetch_queue_depth=prefetch_size, initial_fill=initial_fill,
                                    random_shuffle=random_shuffle, seed=seed, shuffle_after_epoch=seed_change_epoch,
                                    stick_to_shard=stick_to_shard, pad_last_batch=True, 
                                    dont_use_mmap=self._args.dont_use_mmap)
-
-        dataset = self._preprocess(dataset)
         dataset = self._resize(dataset)
         return dataset
+
+    def close(self):
+        super().close()
     
+    def get_sample(self, filename, sample_index):
+        super().get_sample(filename, sample_index)
+
+    def next(self):
+        super().next()
+
+    def read_index(self):
+        super().read_index()   
 
     @dlp.log
-    def _preprocess(self, dataset):
-        if self._args.preprocess_time != 0. or self._args.preprocess_time_stdev != 0.:
-            t = np.random.normal(self._args.preprocess_time, self._args.preprocess_time_stdev)
-            sleep(max(t, 0.0))
-        return dataset
+    def preprocess(self, dataset):
+        raise Exception("Emulated preprocessing method is not implemented in dali readers")
+
 
     @dlp.log
     def _resize(self, dataset):
diff --git a/dlio_benchmark/reader/dali_tfrecord_reader.py b/dlio_benchmark/reader/dali_tfrecord_reader.py
index 01b20c7d..e7de0d32 100644
--- a/dlio_benchmark/reader/dali_tfrecord_reader.py
+++ b/dlio_benchmark/reader/dali_tfrecord_reader.py
@@ -18,7 +18,8 @@
 
 import math
 import logging
-from time import time
+from time import time, sleep
+import numpy as np
 
 import nvidia
 import nvidia.dali.fn as fn
@@ -33,25 +34,25 @@
 
 
 class DaliTFRecordReader(FormatReader):
+    """
+    Reader for NPZ files
+    """    
     @dlp.log_init
     def __init__(self, dataset_type, thread_index, epoch):
         super().__init__(dataset_type, thread_index)
-    def open(self):
-        super().open()
 
+    @dlp.log
+    def open(self, filename):
+        super().open(filename)
+    
     def close(self):
         super().close()
     
     def get_sample(self, filename, sample_index):
         super().get_sample(filename, sample_index)
 
-    def next(self):
-        super().next()
-
-    def read_index(self):
-        super().read_index()
     @dlp.log
-    def read(self):
+    def pipeline(self):
         folder = "valid"
         if self.dataset_type == DatasetType.TRAIN:
             folder = "train"
@@ -85,16 +86,16 @@ def read(self):
                                       random_shuffle=random_shuffle, seed=seed,
                                       stick_to_shard=True, pad_last_batch=True, 
                                       dont_use_mmap=self._args.dont_use_mmap)
-        dataset = self._preprocess(dataset["image"])
-        dataset = self._resize(dataset)
+        dataset = self._resize(dataset['image'])
         return dataset
 
+    def read_index(self):
+        super().read_index()
+        raise Exception("read_index method is not implemented")
+
     @dlp.log
-    def _preprocess(self, dataset):
-        if self._args.preprocess_time != 0. or self._args.preprocess_time_stdev != 0.:
-            t = np.random.normal(self._args.preprocess_time, self._args.preprocess_time_stdev)
-            sleep(max(t, 0.0))
-        return dataset
+    def preprocess(self, dataset):
+        raise Exception("Emulated preprocessing method is not implemented in dali readers")
 
     @dlp.log
     def _resize(self, dataset):
diff --git a/dlio_benchmark/reader/hdf5_reader.py b/dlio_benchmark/reader/hdf5_reader.py
index b8801abf..77dd7301 100644
--- a/dlio_benchmark/reader/hdf5_reader.py
+++ b/dlio_benchmark/reader/hdf5_reader.py
@@ -24,13 +24,10 @@
 
 dlp = Profile(MODULE_DATA_READER)
 
-"""
-Reader for HDF5 files for training file.
-"""
-
-
 class HDF5Reader(FormatReader):
-
+    """
+    Reader for HDF5 files.
+    """
     @dlp.log_init
     def __init__(self, dataset_type, thread_index, epoch):
         super().__init__(dataset_type, thread_index)
@@ -58,10 +55,6 @@ def next(self):
     def read_index(self, image_idx, step):
         return super().read_index(image_idx, step)
 
-    @dlp.log
-    def read(self):
-        return super().read()
-
     @dlp.log
     def finalize(self):
         return super().finalize()
diff --git a/dlio_benchmark/reader/image_reader.py b/dlio_benchmark/reader/image_reader.py
index 2848c868..1fe63a05 100644
--- a/dlio_benchmark/reader/image_reader.py
+++ b/dlio_benchmark/reader/image_reader.py
@@ -59,10 +59,6 @@ def next(self):
     def read_index(self, image_idx, step):
         return super().read_index(image_idx, step)
 
-    @dlp.log
-    def read(self):
-        return super().read()
-
     @dlp.log
     def finalize(self):
         return super().finalize()
diff --git a/dlio_benchmark/reader/npy_reader.py b/dlio_benchmark/reader/npy_reader.py
index 931e3355..d3cc46f1 100644
--- a/dlio_benchmark/reader/npy_reader.py
+++ b/dlio_benchmark/reader/npy_reader.py
@@ -25,7 +25,7 @@
 
 class NPYReader(FormatReader):
     """
-    Reader for NPZ files
+    Reader for NPY files
     """
 
     @dlp.log_init
@@ -58,7 +58,3 @@ def read_index(self, image_idx, step):
     @dlp.log
     def finalize(self):
         return super().finalize()
-    
-    @dlp.log
-    def read(self):
-        return super().read()
\ No newline at end of file
diff --git a/dlio_benchmark/reader/npz_reader.py b/dlio_benchmark/reader/npz_reader.py
index 678ebd12..f0144f74 100644
--- a/dlio_benchmark/reader/npz_reader.py
+++ b/dlio_benchmark/reader/npz_reader.py
@@ -57,8 +57,4 @@ def read_index(self, image_idx, step):
 
     @dlp.log
     def finalize(self):
-        return super().finalize()
-    
-    @dlp.log
-    def read(self):
-        return super().read()
\ No newline at end of file
+        return super().finalize()
\ No newline at end of file
diff --git a/dlio_benchmark/reader/reader_handler.py b/dlio_benchmark/reader/reader_handler.py
index 3ecd79c2..0c1c41ae 100644
--- a/dlio_benchmark/reader/reader_handler.py
+++ b/dlio_benchmark/reader/reader_handler.py
@@ -60,7 +60,7 @@ def preprocess(self):
 
     @abstractmethod
     def open(self, filename):
-        pass
+        return 
 
     @abstractmethod
     def close(self, filename):
@@ -125,10 +125,6 @@ def finalize(self):
                 self.close(filename)
                 self.open_file_map[filename] = None
 
-    @abstractmethod
-    def read(self):
-        return
-
     def __del__(self):
         self.thread_index = None
         self._args = None
diff --git a/dlio_benchmark/reader/tf_reader.py b/dlio_benchmark/reader/tf_reader.py
index f40dbdcc..d19ea96a 100644
--- a/dlio_benchmark/reader/tf_reader.py
+++ b/dlio_benchmark/reader/tf_reader.py
@@ -55,7 +55,7 @@ def resize_sample(self, filename, sample_index):
         pass
 
     @dlp.log
-    def parse_image(self, serialized):
+    def _parse_image(self, serialized):
         """
         performs deserialization of the tfrecord.
         :param serialized: is the serialized version using protobuf
@@ -85,7 +85,7 @@ def next(self):
         self._dataset = tf.data.TFRecordDataset(filenames=self._file_list, buffer_size=self._args.transfer_size)
         self._dataset = self._dataset.shard(num_shards=self._args.comm_size, index=self._args.my_rank)
         self._dataset = self._dataset.map(
-            lambda x: tf.py_function(func=self.parse_image, inp=[x], Tout=[tf.uint8])
+            lambda x: tf.py_function(func=self._parse_image, inp=[x], Tout=[tf.uint8])
             , num_parallel_calls=self._args.computation_threads)
         self._dataset = self._dataset.batch(self.batch_size, drop_remainder=True)
         total = math.ceil(len(self._file_list)/self._args.comm_size / self.batch_size * self._args.num_samples_per_file)
@@ -103,8 +103,4 @@ def read_index(self, image_idx, step):
 
     @dlp.log
     def finalize(self):
-        return super().finalize()
-
-    @dlp.log
-    def read(self):
-        return super().read()
\ No newline at end of file
+        return super().finalize()
\ No newline at end of file

From ca760fe175a46731d39c7645e6e4995b64ec925d Mon Sep 17 00:00:00 2001
From: Huihuo Zheng <zhenghh04@gmail.com>
Date: Tue, 12 Dec 2023 16:22:25 +0000
Subject: [PATCH 29/36] added exceptions for unimplemented methods

---
 dlio_benchmark/reader/dali_image_reader.py    |  3 +++
 dlio_benchmark/reader/dali_npy_reader.py      |  5 ++++-
 dlio_benchmark/reader/dali_tfrecord_reader.py | 13 +++++++++----
 3 files changed, 16 insertions(+), 5 deletions(-)

diff --git a/dlio_benchmark/reader/dali_image_reader.py b/dlio_benchmark/reader/dali_image_reader.py
index 02f0756b..34c9bf7b 100644
--- a/dlio_benchmark/reader/dali_image_reader.py
+++ b/dlio_benchmark/reader/dali_image_reader.py
@@ -44,12 +44,15 @@ def close(self):
     
     def get_sample(self, filename, sample_index):
         super().get_sample(filename, sample_index)
+        raise Exception("get sample method is not implemented in dali readers")
 
     def next(self):
         super().next()
+        raise Exception("next method is not implemented in dali readers")
 
     def read_index(self):
         super().read_index()
+        raise Exception("read_index method is not implemented in dali readers")
 
     @dlp.log
     def pipeline(self):
diff --git a/dlio_benchmark/reader/dali_npy_reader.py b/dlio_benchmark/reader/dali_npy_reader.py
index 00ceada6..b69ea602 100644
--- a/dlio_benchmark/reader/dali_npy_reader.py
+++ b/dlio_benchmark/reader/dali_npy_reader.py
@@ -77,12 +77,15 @@ def close(self):
     
     def get_sample(self, filename, sample_index):
         super().get_sample(filename, sample_index)
+        raise Exception("get sample method is not implemented in dali readers")
 
     def next(self):
         super().next()
+        raise Exception("next method is not implemented in dali readers")
 
     def read_index(self):
-        super().read_index()   
+        super().read_index()
+        raise Exception("read_index method is not implemented in dali readers")
 
     @dlp.log
     def preprocess(self, dataset):
diff --git a/dlio_benchmark/reader/dali_tfrecord_reader.py b/dlio_benchmark/reader/dali_tfrecord_reader.py
index e7de0d32..dfb3c23e 100644
--- a/dlio_benchmark/reader/dali_tfrecord_reader.py
+++ b/dlio_benchmark/reader/dali_tfrecord_reader.py
@@ -47,9 +47,6 @@ def open(self, filename):
     
     def close(self):
         super().close()
-    
-    def get_sample(self, filename, sample_index):
-        super().get_sample(filename, sample_index)
 
     @dlp.log
     def pipeline(self):
@@ -89,9 +86,17 @@ def pipeline(self):
         dataset = self._resize(dataset['image'])
         return dataset
 
+    def get_sample(self, filename, sample_index):
+        super().get_sample(filename, sample_index)
+        raise Exception("get sample method is not implemented in dali readers")
+
+    def next(self):
+        super().next()
+        raise Exception("next method is not implemented in dali readers")
+
     def read_index(self):
         super().read_index()
-        raise Exception("read_index method is not implemented")
+        raise Exception("read_index method is not implemented in dali readers")
 
     @dlp.log
     def preprocess(self, dataset):

From a8ba464968c626836e67f134b2e608c6a75c54f7 Mon Sep 17 00:00:00 2001
From: Huihuo Zheng <zhenghh04@gmail.com>
Date: Thu, 14 Dec 2023 00:09:40 +0000
Subject: [PATCH 30/36] added preprocessing

---
 dlio_benchmark/reader/dali_image_reader.py    | 5 +----
 dlio_benchmark/reader/dali_npy_reader.py      | 8 ++------
 dlio_benchmark/reader/dali_tfrecord_reader.py | 5 +----
 dlio_benchmark/reader/reader_handler.py       | 2 +-
 4 files changed, 5 insertions(+), 15 deletions(-)

diff --git a/dlio_benchmark/reader/dali_image_reader.py b/dlio_benchmark/reader/dali_image_reader.py
index 34c9bf7b..032885cd 100644
--- a/dlio_benchmark/reader/dali_image_reader.py
+++ b/dlio_benchmark/reader/dali_image_reader.py
@@ -83,13 +83,10 @@ def pipeline(self):
                                          stick_to_shard=stick_to_shard, pad_last_batch=True, 
                                          dont_use_mmap=self._args.dont_use_mmap)
         images = fn.decoders.image(images, device='cpu')
+        fn.python_function(dataset, function=self.preprocess, num_outputs=0)
         dataset = self._resize(images)
         return dataset
 
-    @dlp.log
-    def preprocess(self):
-        raise Exception("Emulated preprocessing method is not implemented in dali readers")
-
     @dlp.log
     def _resize(self, dataset):
         return fn.resize(dataset, size=[self._args.max_dimension, self._args.max_dimension])
diff --git a/dlio_benchmark/reader/dali_npy_reader.py b/dlio_benchmark/reader/dali_npy_reader.py
index b69ea602..e68f4fb2 100644
--- a/dlio_benchmark/reader/dali_npy_reader.py
+++ b/dlio_benchmark/reader/dali_npy_reader.py
@@ -70,6 +70,7 @@ def pipeline(self):
                                    stick_to_shard=stick_to_shard, pad_last_batch=True, 
                                    dont_use_mmap=self._args.dont_use_mmap)
         dataset = self._resize(dataset)
+        fn.python_function(dataset, function=self.preprocess, num_outputs=0)
         return dataset
 
     def close(self):
@@ -86,12 +87,7 @@ def next(self):
     def read_index(self):
         super().read_index()
         raise Exception("read_index method is not implemented in dali readers")
-
-    @dlp.log
-    def preprocess(self, dataset):
-        raise Exception("Emulated preprocessing method is not implemented in dali readers")
-
-
+    
     @dlp.log
     def _resize(self, dataset):
         return fn.resize(dataset, size=[self._args.max_dimension, self._args.max_dimension])
diff --git a/dlio_benchmark/reader/dali_tfrecord_reader.py b/dlio_benchmark/reader/dali_tfrecord_reader.py
index dfb3c23e..0a33f327 100644
--- a/dlio_benchmark/reader/dali_tfrecord_reader.py
+++ b/dlio_benchmark/reader/dali_tfrecord_reader.py
@@ -84,6 +84,7 @@ def pipeline(self):
                                       stick_to_shard=True, pad_last_batch=True, 
                                       dont_use_mmap=self._args.dont_use_mmap)
         dataset = self._resize(dataset['image'])
+        fn.python_function(dataset, function=self.preprocess, num_outputs=0)
         return dataset
 
     def get_sample(self, filename, sample_index):
@@ -98,10 +99,6 @@ def read_index(self):
         super().read_index()
         raise Exception("read_index method is not implemented in dali readers")
 
-    @dlp.log
-    def preprocess(self, dataset):
-        raise Exception("Emulated preprocessing method is not implemented in dali readers")
-
     @dlp.log
     def _resize(self, dataset):
         return fn.resize(dataset, size=[self._args.max_dimension, self._args.max_dimension])
diff --git a/dlio_benchmark/reader/reader_handler.py b/dlio_benchmark/reader/reader_handler.py
index 0c1c41ae..57884dd5 100644
--- a/dlio_benchmark/reader/reader_handler.py
+++ b/dlio_benchmark/reader/reader_handler.py
@@ -53,7 +53,7 @@ def __init__(self, dataset_type, thread_index):
         self.batch_size = self._args.batch_size if self.dataset_type is DatasetType.TRAIN else self._args.batch_size_eval
 
     @dlp.log
-    def preprocess(self):
+    def preprocess(self, a=None):
         if self._args.preprocess_time != 0. or self._args.preprocess_time_stdev != 0.:
             t = np.random.normal(self._args.preprocess_time, self._args.preprocess_time_stdev)
             sleep(max(t, 0.0))

From 1f0315968369e63aa29cb6e76ac99a3c1bd765a3 Mon Sep 17 00:00:00 2001
From: Huihuo Zheng <zhenghh04@gmail.com>
Date: Thu, 14 Dec 2023 15:54:02 +0000
Subject: [PATCH 31/36] conditional cache for DLIO installation

---
 .github/workflows/python-package-conda.yml | 8 +++++++-
 1 file changed, 7 insertions(+), 1 deletion(-)

diff --git a/.github/workflows/python-package-conda.yml b/.github/workflows/python-package-conda.yml
index 6f584b8f..8c53dd0f 100644
--- a/.github/workflows/python-package-conda.yml
+++ b/.github/workflows/python-package-conda.yml
@@ -50,6 +50,7 @@ jobs:
         sudo apt-get install $CC $CXX libc6
         sudo apt-get install mpich libhwloc-dev
     - name: Install DLIO
+      if: steps.cache-modules.outputs.cache-hit != 'true'
       run: |
         echo "Profiler ${DLIO_PROFILER} gcc $CC"
         python -m pip install --upgrade pip
@@ -57,7 +58,12 @@ jobs:
         python -m venv ${VENV}
         source ${VENV}/bin/activate
         pip install .[test]
-        rm -rf dlio_benchmark
+      if: steps.cache-modules.outputs.cache-hit == 'true'    
+      run: |
+        source ${VENV}/bin/activate
+        rm -rf *.egg*
+        rm -rf build
+        python setup.py install
     - name: Install DLIO Profiler
       run: |
         echo "Profiler ${DLIO_PROFILER} gcc $CC"

From 5dd6ebfbda09b09dd8d31f3184574968021377f4 Mon Sep 17 00:00:00 2001
From: Huihuo Zheng <zhenghh04@gmail.com>
Date: Thu, 14 Dec 2023 15:57:26 +0000
Subject: [PATCH 32/36] fixed bugs

---
 .github/workflows/python-package-conda.yml | 15 ++++++++-------
 1 file changed, 8 insertions(+), 7 deletions(-)

diff --git a/.github/workflows/python-package-conda.yml b/.github/workflows/python-package-conda.yml
index 8c53dd0f..095573e7 100644
--- a/.github/workflows/python-package-conda.yml
+++ b/.github/workflows/python-package-conda.yml
@@ -49,6 +49,13 @@ jobs:
         sudo apt update
         sudo apt-get install $CC $CXX libc6
         sudo apt-get install mpich libhwloc-dev
+    - name: Install DLIO code only 
+      if: steps.cache-modules.outputs.cache-hit == 'true'    
+      run: |
+        source ${VENV}/bin/activate
+        rm -rf *.egg*
+        rm -rf build
+        python setup.py install
     - name: Install DLIO
       if: steps.cache-modules.outputs.cache-hit != 'true'
       run: |
@@ -57,13 +64,7 @@ jobs:
         pip install virtualenv
         python -m venv ${VENV}
         source ${VENV}/bin/activate
-        pip install .[test]
-      if: steps.cache-modules.outputs.cache-hit == 'true'    
-      run: |
-        source ${VENV}/bin/activate
-        rm -rf *.egg*
-        rm -rf build
-        python setup.py install
+        pip install .[test]       
     - name: Install DLIO Profiler
       run: |
         echo "Profiler ${DLIO_PROFILER} gcc $CC"

From 6af315b29fefd8883c45b5ff518c241aa852b700 Mon Sep 17 00:00:00 2001
From: Huihuo Zheng <zhenghh04@gmail.com>
Date: Thu, 14 Dec 2023 16:06:23 +0000
Subject: [PATCH 33/36] fixed bugs

---
 .github/workflows/python-package-conda.yml | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/.github/workflows/python-package-conda.yml b/.github/workflows/python-package-conda.yml
index 095573e7..d8314b73 100644
--- a/.github/workflows/python-package-conda.yml
+++ b/.github/workflows/python-package-conda.yml
@@ -47,14 +47,16 @@ jobs:
     - name: Install System Tools
       run: |
         sudo apt update
-        sudo apt-get install $CC $CXX libc6
+        sudo apt-get install $CC $CXX libc6 git
         sudo apt-get install mpich libhwloc-dev
     - name: Install DLIO code only 
       if: steps.cache-modules.outputs.cache-hit == 'true'    
       run: |
+        git pull
         source ${VENV}/bin/activate
         rm -rf *.egg*
         rm -rf build
+        python setup.py build
         python setup.py install
     - name: Install DLIO
       if: steps.cache-modules.outputs.cache-hit != 'true'

From 8dc2b71ed8956ba7768d91f10dc1c1d3d57de225 Mon Sep 17 00:00:00 2001
From: Huihuo Zheng <zhenghh04@gmail.com>
Date: Thu, 14 Dec 2023 16:08:28 +0000
Subject: [PATCH 34/36] fixed bugs

---
 .github/workflows/python-package-conda.yml | 1 -
 1 file changed, 1 deletion(-)

diff --git a/.github/workflows/python-package-conda.yml b/.github/workflows/python-package-conda.yml
index d8314b73..2c4dd4a0 100644
--- a/.github/workflows/python-package-conda.yml
+++ b/.github/workflows/python-package-conda.yml
@@ -52,7 +52,6 @@ jobs:
     - name: Install DLIO code only 
       if: steps.cache-modules.outputs.cache-hit == 'true'    
       run: |
-        git pull
         source ${VENV}/bin/activate
         rm -rf *.egg*
         rm -rf build

From 7e72ed5cdbc51496cc9e90aa9d8a118e5db66f56 Mon Sep 17 00:00:00 2001
From: Huihuo Zheng <zhenghh04@gmail.com>
Date: Thu, 14 Dec 2023 16:24:52 +0000
Subject: [PATCH 35/36] fixing again

---
 .github/workflows/python-package-conda.yml | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/.github/workflows/python-package-conda.yml b/.github/workflows/python-package-conda.yml
index 2c4dd4a0..7b78713d 100644
--- a/.github/workflows/python-package-conda.yml
+++ b/.github/workflows/python-package-conda.yml
@@ -55,6 +55,8 @@ jobs:
         source ${VENV}/bin/activate
         rm -rf *.egg*
         rm -rf build
+        rm -rf dist
+        pip uninstall dlio_benchmark
         python setup.py build
         python setup.py install
     - name: Install DLIO

From 4e727a85c0b5f1cb86efb2a50ef36322f41e885c Mon Sep 17 00:00:00 2001
From: Huihuo Zheng <zhenghh04@gmail.com>
Date: Thu, 14 Dec 2023 19:52:44 +0000
Subject: [PATCH 36/36] tests again

---
 .github/workflows/python-package-conda.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/workflows/python-package-conda.yml b/.github/workflows/python-package-conda.yml
index 7b78713d..40dd2091 100644
--- a/.github/workflows/python-package-conda.yml
+++ b/.github/workflows/python-package-conda.yml
@@ -56,7 +56,7 @@ jobs:
         rm -rf *.egg*
         rm -rf build
         rm -rf dist
-        pip uninstall dlio_benchmark
+        pip uninstall -y dlio_benchmark
         python setup.py build
         python setup.py install
     - name: Install DLIO