From dd187e937d5edd1a0c824ab89685b7a1dcb689d5 Mon Sep 17 00:00:00 2001
From: Huihuo Zheng
Date: Wed, 16 Nov 2022 00:13:31 -0600
Subject: [PATCH] Fixed some typos in the documentation; removed tests/configs
 to avoid potential conflicts

---
 docs/source/config.rst                  | 24 ++++++++----
 docs/source/examples.rst                |  4 +-
 docs/source/index.rst                   | 12 +++---
 docs/source/overview.rst                | 52 +++++++++++++++----------
 src/data_generator/data_generator.py    |  8 ++++
 src/dlio_benchmark.py                   | 40 ++++++++++++-------
 tests/configs/config.yaml               |  9 -----
 tests/configs/workload/bert_test.yaml   | 30 --------------
 tests/configs/workload/unet3d_test.yaml | 32 ---------------
 tests/dlio_benchmark_test.py            | 13 +++++--
 10 files changed, 100 insertions(+), 124 deletions(-)
 delete mode 100644 tests/configs/config.yaml
 delete mode 100644 tests/configs/workload/bert_test.yaml
 delete mode 100644 tests/configs/workload/unet3d_test.yaml

diff --git a/docs/source/config.rst b/docs/source/config.rst
index 0abbb98a..e87dde18 100644
--- a/docs/source/config.rst
+++ b/docs/source/config.rst
@@ -2,7 +2,7 @@
 DLIO Configuration
 ==============================================
 
-The characteristics of a workload is specified through a YAML file. This will then be read by the DLIO program to setup the benchmark. Below is an example of such a YAML file. More examples can be found in the `workload`_ folder.
+The characteristics of a workload are specified through a YAML file. This file will then be read by `DLIO` to set up the benchmark. Below is an example of such a YAML file. More examples can be found in the `workload`_ folder.
 
 .. code-block:: yaml
 
    model: unet3d
 
    framework: pytorch
 
    workflow:
-     generate_data: True
+     generate_data: False
      train: True
      evaluation: True
@@ -40,11 +40,11 @@ The characteristics of a workload is specified through a YAML file. This will th
      eval_time: 11.572
      epochs_between_evals: 2
 
-A DLIO YAML configuration file contains following sections:
+A `DLIO` YAML configuration file contains the following sections:
 
 * **model** - specifying the name of the model.
 * **framework** - specifying the framework to use for the benchmark, options: tensorflow, pytorch
-* **workflow** - specifying what workflow operations to perform, including dataset generation, training, evaluation, checkpointing, evaluation, debugging, etc.
+* **workflow** - specifying what workflow operations to perform. Workflow operations include: dataset generation (``generate_data``), training (``train``), evaluation (``evaluation``), checkpointing (``checkpoint``), debugging (``debug``), etc.
 * **dataset** - specifying all the information related to the dataset.
 * **data_reader** - specifying the data loading options
 * **train** - specifying the setup for training
@@ -54,22 +54,26 @@ A DLIO YAML configuration file contains following sections:
 
 model
 ------------------
-No other parameters under this section. One can specify the name of the model as
 
 .. code-block:: yaml
 
    model: unet3d
 
+No other parameters under this section.
+
+
 framework
 -------------------
-No parameters under this group. Specify the frameork (tensorflow or pytorch) as
 
 .. code-block:: yaml
 
    framework: tensorflow
 
+No parameters under this group.
+
+
 workflow
 ------------------
 .. list-table::
@@ -127,7 +131,7 @@ dataset
      - number of samples per file
    * - data_folder
      - ./data
-     - the path to store the dataset
+     - the path to store the dataset.
   * - num_subfolders_train
     - 0
     - number of subfolders that the training set is stored
@@ -159,6 +163,10 @@ dataset
     - True
     - whether to keep the dataset files afer the simulation.
 
+.. note::
+   The training and validation datasets will be put in ``${data_folder}/train`` and ``${data_folder}/valid`` respectively. If ``num_subfolders_train`` and ``num_subfolders_eval`` are larger than one, the datasets will be split into multiple subfolders within ``${data_folder}/train`` and ``${data_folder}/valid``.
+
+
 data_reader
 ------------------
 .. list-table::
@@ -197,7 +205,7 @@ data_reader
    If ``none`` is set for ``data_reader.data_loader``, then custom data reader such as
    ``npz_reader``, ``csv_reader``, ``hdf5_reader`` will be used.
-   Currently, these custom readers do not support advance features
+   Currently, these custom readers do not support advanced features
    such as multiple read_threads, prefetch, etc.
 
 train
diff --git a/docs/source/examples.rst b/docs/source/examples.rst
index 158b0d14..6b09902b 100644
--- a/docs/source/examples.rst
+++ b/docs/source/examples.rst
@@ -1,7 +1,7 @@
 Examples
 =============
 
-We here list a set of workloads. In the first example, we show the whole process of generating the data, running the benchmark with profiling, and processing the logs and profiling data. For the rest of the workloads, we provide the YAML configure file.
+Here we list a set of example workloads. In the first example, we show the benchmarking process, including generating the dataset, running the benchmark with profiling, and processing the logs and profiling data. For the rest of the workloads, we list the YAML configuration files.
 
 UNET3D: 3D Medical Image Segmentation
 ---------------------------------------
@@ -10,8 +10,6 @@ UNET3D: 3D Medical Image Segmentation
 * Dataset: .npz format image files containing a single sample.
 * Trains over multiple epochs, performs evaluation on a held-out test set periodically.
 
-We specify the configuration in ./configs/workload/unet3d.yaml
-
 .. code-block:: yaml
 
   # contents of unet3d.yaml
diff --git a/docs/source/index.rst b/docs/source/index.rst
index 19316379..8993f70e 100644
--- a/docs/source/index.rst
+++ b/docs/source/index.rst
@@ -2,16 +2,18 @@
 Deep Learning I/O Benchmark
 ===============================================================
 
-Deep Learning I/O (DLIO) Benchmark is a benchmark suite aiming at emulating the I/O pattern / behavior of deep learning applications. The benchmark is delivered as an executable that can be configured for various deep learning specific I/O patterns. It uses a modular design to incorporate different data loaders, data formats, dataset organizations, and training configuration parameters to make it able to represent a broad spectrum of applications. The code is composed of four modules: **Benchmark Runner**, **Data Generator**, **Format Handler**, and **I/O Profiler**.
+Deep Learning I/O (`DLIO`) Benchmark is a benchmark suite aiming at emulating the I/O pattern / behavior of deep learning applications. The benchmark is delivered as an executable that can be configured for various deep learning specific I/O patterns. It uses a modular design to incorporate different data loaders, data formats, dataset organizations, and training configuration parameters and is able to represent a broad spectrum of deep learning applications.
 
-The main features of DLIO include:
-  * Easy-to-use configuration through YAML config files to represent the I/O behavior of different deep learing applications.
-  * Easy-to-use data generator to generate synthetic datasets of different formats, different data organizations and layouts.
+The main features of `DLIO` include:
+  * Easy-to-use configuration through YAML files which represent the I/O behavior of different deep learning applications.
+  * Easy-to-use data generator capable of generating synthetic datasets of different formats, different data organizations and layouts.
   * Full transparency over emulation of I/O access with logging and profiling at different levels with modern profilers such as Tensorboard, Torch profiler, darshan and iostat, etc.
-  * Supporting emulating both sequential and distributed data parallel training.
+  * Support for emulating both sequential training and distributed data parallel training.
 
 GitHub repo: https://github.com/argonne-lcf/dlio_benchmark.
 
+==================================
+
 .. toctree::
    :maxdepth: 2
    :caption: Overview
diff --git a/docs/source/overview.rst b/docs/source/overview.rst
index 44416a4e..0fb61e13 100644
--- a/docs/source/overview.rst
+++ b/docs/source/overview.rst
@@ -2,25 +2,37 @@ Introduction
 =============
 
 Deep learning has been shown as a successful method for various tasks, and its popularity results in numerous
-open-source deep learning software tools. Deep learning has
+open-source deep learning software tools. It has
 been applied to a broad spectrum of scientific domains such as
 cosmology, particle physics, computer vision, fusion, and
-astrophysics. Scientists have performed a great deal of work
-to optimize the computational performance of deep learning
-frameworks. However, the same cannot be said for I/O performance. As deep learning algorithms rely on big-data volume and
+astrophysics. As deep learning algorithms rely on big-data volume and
 variety to effectively train neural networks accurately, I/O is
 a significant bottleneck on large-scale distributed deep learning training.
 
-The DLIO benchmark aims to provide a detailed representation of
-the I/O behavior of deep learning workloads. DLIO can be utilized to accurately emulate the I/O behavior of modern deep learning
-applications. Using DLIO, application developers and system
+The `DLIO` benchmark aims to provide a detailed representation of
+the data access pattern of deep learning workloads, to
+accurately emulate the I/O behavior in the training process.
+Using `DLIO`, application developers and system
 software solution architects can identify potential I/O
 bottlenecks in their applications and guide optimizations to boost the I/O
-performance. The storage vendors can also use DLIO benchmark as a guidance for storage and file system design.
-
-In developing the benchmark, we have the following two considerations:
-
-First, we assume that one can replace the computation part (training and validation) with a sleep of the same amount of time, while keeping the I/O pattern / behavior the same. The logic behind this is demonstrated as shown in the figure. In a typical deep leanring training process, a batch of data is loaded from the storage to host memory at each time step, and then transfered to the accelerator to perform the training. There might be some hardware supporting loading data from storage directly to the accelerators such as GPU Direct. In either case, the I/O (data access in the storage) should be independent of what is going on inside the accelerator, as long as the frequency of the I/O requests remains the same.
+performance. The storage vendors can also use the `DLIO` benchmark as
+guidance for designing storage and file systems
+targeting deep learning applications.
+
+In developing the benchmark, we have the following two assumptions:
+
+First, we assume that one can replace the computation part
+(training and validation) with a sleep of the same amount of time,
+while keeping the I/O pattern / behavior the same.
+The logic behind this is illustrated in the figure below.
+In a typical deep learning training process, a batch of data is
+loaded from the storage to host memory at each time step,
+and then transferred to the accelerator to perform the training.
+There might be some hardware that supports loading data from storage
+directly to the accelerators, such as GPU Direct. In either case,
+the I/O (data access in the storage) should likely be independent of
+what is going on inside the accelerator, as long as the
+frequency of the I/O requests remains the same.
 
 
 .. figure:: ./images/training.png
 
@@ -41,35 +53,35 @@ more data formats, datasets, and configuration parameters. It
 emulates deep learning applications using
 **Benchmark Runner**, **Data Generator**, **Format Handler**, and
 **I/O Profiler** modules. These modules utilize state-of-the-art design
 patterns to build a transparent and extensible framework. The
-DLIO benchmark has been designed with the following goals.
+`DLIO` benchmark has been designed with the following goals.
 
-1) **Accurate**: DLIO should be an accurate representation of
+1) **Accurate**: `DLIO` should be an accurate representation of
 selected deep learning applications. It should incorporate all the
 I/O behavior seen in various configurations of applications, and act
 as a mini-application that can precisely replay the I/O behavior.
 
-2) **Configurable**: DLIO should be easily configurable for
+2) **Configurable**: `DLIO` should be easily configurable for
 different scenarios required by the user. These include
 features such as different ratio-of-computation to I/O, multi
 threading for I/O, data operators (e.g., decoding, shuffling,
 prefetch, and batching), and mechanism to feed data into training.
 
-3) **Extensible**: DLIO benchmark should allow adding
+3) **Extensible**: `DLIO` benchmark should allow adding
 custom data directories and enable easy extensions to the
 benchmark to incorporate different data formats, data loaders or
 data generation algorithms. These changes should not affect
 the basic benchmark operations.
 
 ''''''''''''''''''''
-DLIO Code Modules
+`DLIO` Code Modules
 ''''''''''''''''''''
 
-Below shows the modules of the DLIO code.
+The modules of the `DLIO` code are shown below.
 
 .. image:: images/dlio.png
 
-* **Configuration Manager**: the user specifies a YAML file which represents the characteristics of a real workload. The configuration manager will load the configuration into DLIO.
+* **Configuration Manager**: the user specifies a YAML file which represents the characteristics of a real workload. The configuration manager will load the configuration into `DLIO`.
 
 * **Format Handler**: Format Handler will handle the data read and write for specific data format.
 
-* **Data Generator**: this is for generating synthetic datasets. This eliminates the dependence on real dataset which is typically difficult to get. DLIO can generate synthetic data in different formats, different organization and layouts on the storage, such as:
+* **Data Generator**: this is for generating synthetic datasets. This eliminates the dependence on real datasets, which are typically difficult to get. `DLIO` can generate synthetic data in different formats, different organizations and layouts on the storage, such as:
 
   * Single shared file in which the entire datasets is stored in one file.
   * One samples per file
diff --git a/src/data_generator/data_generator.py b/src/data_generator/data_generator.py
index 2d5cba81..3ad7d3c2 100644
--- a/src/data_generator/data_generator.py
+++ b/src/data_generator/data_generator.py
@@ -22,6 +22,9 @@
 from mpi4py import MPI
 from shutil import copyfile
 import numpy as np
+import logging
+from src.utils.utility import utcnow
+
 
 class DataGenerator(ABC):
 
@@ -57,6 +60,11 @@ def generate(self):
         if self.num_subfolders_eval > 1:
             for i in range(self.num_subfolders_train):
                 os.makedirs(self.data_dir + "/valid/%d"%i, exist_ok=True)
+        logging.info(f"{utcnow()} Generating dataset in {self.data_dir}/train and {self.data_dir}/valid")
+        logging.info(f"{utcnow()} Number of files for training dataset: {self.num_files_train}")
+        logging.info(f"{utcnow()} Number of files for validation dataset: {self.num_files_eval}")
+
+        MPI.COMM_WORLD.barrier()
         # What is the logic behind this formula?
         # Will probably have to adapt to generate non-images
diff --git a/src/dlio_benchmark.py b/src/dlio_benchmark.py
index 6453ea5a..77a7f9bd 100644
--- a/src/dlio_benchmark.py
+++ b/src/dlio_benchmark.py
@@ -59,12 +59,32 @@ def __init__(self, cfg):
         """
         self.args = ConfigArguments.get_instance()
+
+        LoadConfig(self.args, cfg)
+
         try:
             hydra_cfg = hydra.core.hydra_config.HydraConfig.get()
             self.args.output_folder = hydra_cfg['runtime']['output_dir']
         except:
             self.args.output_folder = 'output/'
+
+        self.output_folder = self.args.output_folder
+
+        self.logfile = os.path.join(self.output_folder, self.args.log_file)
+
+        # Configure the logging library
+        log_level = logging.DEBUG if self.args.debug else logging.INFO
+        logging.basicConfig(
+            level=log_level,
+            handlers=[
+                logging.FileHandler(self.logfile, mode = "a", encoding='utf-8'),
+                logging.StreamHandler()
+            ],
+            format='%(message)s [%(pathname)s:%(lineno)d]'  # logging's max timestamp resolution is msecs, we will pass in usecs in the message
+        )
+
+
         self.logdir = self.args.logdir
         self.data_folder = self.args.data_folder
         self.output_folder = self.args.output_folder
@@ -77,25 +97,19 @@ def __init__(self, cfg):
         self.my_rank = self.args.my_rank = self.framework.rank()
         self.comm_size = self.args.comm_size = self.framework.size()
         self.framework.init_reader(self.args.format, self.args.data_loader)
-        self.logfile = os.path.join(self.output_folder, self.args.log_file)
+
+        if self.args.my_rank==0:
+            logging.info(f"{utcnow()} Running DLIO with {self.args.comm_size} processes")
+            try:
+                logging.info(f"{utcnow()} Reading YAML config file './configs/workload/{hydra_cfg.runtime.choices.workload}.yaml'" )
+            except:
+                pass
 
         # Delete previous logfile
         if self.my_rank == 0:
             if os.path.isfile(self.logfile):
                 os.remove(self.logfile)
-        # Configure the logging library
-        log_level = logging.DEBUG if self.args.debug else logging.INFO
-        logging.basicConfig(
-            level=log_level,
-            handlers=[
-                logging.FileHandler(self.logfile, mode = "a", encoding='utf-8'),
-                logging.StreamHandler()
-            ],
-            format='%(message)s [%(pathname)s:%(lineno)d]' # logging's max timestamp resolution is msecs, we will pass in usecs in the message
-        )
-        logging.info(f"{utcnow()} Running DLIO with {self.comm_size} processes")
-
         self.generate_only = self.args.generate_only
         self.do_profiling = self.args.do_profiling
 
diff --git a/tests/configs/config.yaml b/tests/configs/config.yaml
deleted file mode 100644
index bac61195..00000000
--- a/tests/configs/config.yaml
+++ /dev/null
@@ -1,9 +0,0 @@
-# A set of configuration
-defaults:
-  - _self_
-  - workload: unet3d_test
-  - override hydra/job_logging: disabled
-  - override hydra/hydra_logging: disabled
-hydra:
-  run:
-    dir: ./hydra_log/${workload.model}/${now:%Y-%m-%d}-${now:%H-%M-%S}
\ No newline at end of file
diff --git a/tests/configs/workload/bert_test.yaml b/tests/configs/workload/bert_test.yaml
deleted file mode 100644
index ccb3b524..00000000
--- a/tests/configs/workload/bert_test.yaml
+++ /dev/null
@@ -1,30 +0,0 @@
-model: bert_test
-
-framework: tensorflow
-
-workflow:
-  generate_data: False
-  train: True
-  debug: False
-  checkpoint: True
-
-dataset:
-  data_folder: ./data/bert/
-  format: tfrecord
-  num_files_train: 10
-  num_samples_per_file: 313
-  record-length: 2500
-  batch_size: 7
-
-train:
-  computation_time: 0.968
-
-data_reader:
-  data_loader: tensorflow
-  read_threads: 1
-  computation_threads: 8
-  transfer_size: 262144
-
-checkpoint:
-  steps_between_checkpoints: 100
-  model_size: 40347133
diff --git a/tests/configs/workload/unet3d_test.yaml b/tests/configs/workload/unet3d_test.yaml
deleted file mode 100644
index 83b17df4..00000000
--- a/tests/configs/workload/unet3d_test.yaml
+++ /dev/null
@@ -1,32 +0,0 @@
-model: unet3d
-
-framework: pytorch
-
-workflow:
-  generate_data: False
-  train: True
-  evaluation: True
-
-dataset:
-  data_folder: ./data/unet3d/
-  format: npz
-  num_files_train: 300
-  num_files_eval: 20
-  num_samples_per_file: 1
-  batch_size: 4
-  batch_size_eval: 1
-  file_access: multi
-  record_length: 1145359
-  keep_files: True
-
-data_reader:
-  data_loader: pytorch
-
-train:
-  epochs: 3
-  computation_time: 0.59
-
-evaluation:
-  eval_time: 1.572
-  eval_after_epoch: 1
-  epochs_between_evals: 1
diff --git a/tests/dlio_benchmark_test.py b/tests/dlio_benchmark_test.py
index 8eb404d0..f82731b0 100644
--- a/tests/dlio_benchmark_test.py
+++ b/tests/dlio_benchmark_test.py
@@ -16,18 +16,23 @@ import glob
 
 class TestDLIOBenchmark(unittest.TestCase):
     def test_step0_gen_data(self) -> None:
-        with initialize(version_base=None, config_path="./configs"):
+        with initialize(version_base=None, config_path="../configs"):
             cfg = compose(config_name='config', overrides=['++workload.workflow.train=False',
                                                            '++workload.workflow.generate_data=True'])
             benchmark = DLIOBenchmark(cfg['workload'])
             benchmark.initialize()
             benchmark.run()
             benchmark.finalize()
-            assert(len(glob.glob(cfg.workload.dataset.data_folder + "train/*.npz"))==cfg.workload.dataset.num_files_train)
-            assert(len(glob.glob(cfg.workload.dataset.data_folder + "valid/*.npz"))==cfg.workload.dataset.num_files_eval)
+            if benchmark.args.num_subfolders_train<=1:
+                assert(len(glob.glob(cfg.workload.dataset.data_folder + "train/*.npz"))==cfg.workload.dataset.num_files_train)
+                assert(len(glob.glob(cfg.workload.dataset.data_folder + "valid/*.npz"))==cfg.workload.dataset.num_files_eval)
+            else:
+                assert(len(glob.glob(cfg.workload.dataset.data_folder + "train/*/*.npz"))==cfg.workload.dataset.num_files_train)
+                assert(len(glob.glob(cfg.workload.dataset.data_folder + "valid/*/*.npz"))==cfg.workload.dataset.num_files_eval)
+
         return 0
 
     def test_step1_train(self) -> None:
-        with initialize(version_base=None, config_path="./configs"):
+        with initialize(version_base=None, config_path="../configs"):
             cfg = compose(config_name='config', overrides=['++workload.workflow.train=True',
                                                            '++workload.workflow.generate_data=False',
                                                            'workload.train.computation_time=0.01',
                                                            'workload.evaluation.eval_time=0.005',
                                                            'workload.train.epochs=1'])
             benchmark = DLIOBenchmark(cfg['workload'])
             benchmark.initialize()
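
A minimal sketch (not part of the patch) of how the relocated configs are consumed, mirroring the updated tests/dlio_benchmark_test.py. It assumes the repository root as the working directory and the workload YAMLs living under ./configs/workload after this change; Hydra resolves config_path relative to the calling file, so the relative path shown here is an assumption and may need adjusting.

    # sketch.py -- hypothetical driver, not part of this patch
    from hydra import initialize, compose

    from src.dlio_benchmark import DLIOBenchmark

    # config_path is resolved relative to this file; "configs" assumes this
    # sketch sits in the repository root next to the ./configs directory.
    with initialize(version_base=None, config_path="configs"):
        # Generate the synthetic dataset only, as in test_step0_gen_data.
        cfg = compose(config_name='config',
                      overrides=['++workload.workflow.generate_data=True',
                                 '++workload.workflow.train=False'])
        benchmark = DLIOBenchmark(cfg['workload'])
        benchmark.initialize()
        benchmark.run()
        benchmark.finalize()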