Skip to content

Commit

Permalink
Making DLIO Profiler default for dlio_benchmark. (#111)
Browse files Browse the repository at this point in the history
* making dlp default.

* set dlp version to 0.0.2

* Update utility.py

Unhide the trace file

---------

Co-authored-by: Huihuo Zheng <[email protected]>
  • Loading branch information
hariharan-devarajan and zhenghh04 authored Nov 20, 2023
1 parent 8dd1223 commit 035f512
Show file tree
Hide file tree
Showing 26 changed files with 100 additions and 261 deletions.
15 changes: 4 additions & 11 deletions .github/workflows/python-package-conda.yml
Original file line number Diff line number Diff line change
Expand Up @@ -11,14 +11,14 @@ jobs:
fail-fast: false
matrix:
os: [ ubuntu-20.04, ubuntu-22.04 ]
profiler: [ DEFAULT, DLIO_PROFILER ]
profiler: [ 0, 1 ]
gcc: [10]
python: ["3.8", "3.9", "3.10" ]
name: ${{ matrix.os }}-${{ matrix.profiler }}-${{ matrix.gcc }}-${{ matrix.python }}
runs-on: ${{ matrix.os }}
env:
VENV: "/home/runner/work/venv"
DLIO_PROFILER: ${{ matrix.profiler }}
DLIO_PROFILER_ENABLE: ${{ matrix.profiler }}
CC: gcc-${{ matrix.gcc }}
CXX: g++-${{ matrix.gcc }}
RDMAV_FORK_SAFE: "1"
Expand Down Expand Up @@ -48,10 +48,7 @@ jobs:
run: |
sudo apt update
sudo apt-get install $CC $CXX libc6
sudo apt-get install mpich
if [[ $DLIO_PROFILER == 'DLIO_PROFILER' ]]; then
sudo apt-get install libhwloc-dev
fi
sudo apt-get install mpich libhwloc-dev
- name: Install DLIO
if: steps.cache-modules.outputs.cache-hit != 'true'
run: |
Expand All @@ -60,11 +57,7 @@ jobs:
pip install virtualenv
python -m venv ${VENV}
source ${VENV}/bin/activate
if [[ $DLIO_PROFILER == 'DLIO_PROFILER' ]]; then
pip install .[test,dlio_profiler]
else
pip install .[test]
fi
pip install .[test]
rm -rf dlio_benchmark
- name: test_gen_data
run: |
Expand Down
3 changes: 2 additions & 1 deletion dlio_benchmark/data_generator/hdf5_generator.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,8 @@

from dlio_benchmark.common.enumerations import Compression
from dlio_benchmark.data_generator.data_generator import DataGenerator
from dlio_benchmark.utils.utility import progress, Profile
from dlio_benchmark.utils.utility import progress
from dlio_profiler.logger import fn_interceptor as Profile
from shutil import copyfile

from dlio_benchmark.common.constants import MODULE_DATA_GENERATOR
Expand Down
3 changes: 2 additions & 1 deletion dlio_benchmark/data_generator/jpeg_generator.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,8 @@
import logging
import numpy as np

from dlio_benchmark.utils.utility import progress, utcnow, Profile
from dlio_benchmark.utils.utility import progress, utcnow
from dlio_profiler.logger import fn_interceptor as Profile
from shutil import copyfile
import PIL.Image as im
from dlio_benchmark.common.constants import MODULE_DATA_GENERATOR
Expand Down
3 changes: 2 additions & 1 deletion dlio_benchmark/data_generator/npz_generator.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,8 @@
import logging
import numpy as np

from dlio_benchmark.utils.utility import progress, utcnow, Profile
from dlio_benchmark.utils.utility import progress, utcnow
from dlio_profiler.logger import fn_interceptor as Profile
from shutil import copyfile
from dlio_benchmark.common.constants import MODULE_DATA_GENERATOR

Expand Down
3 changes: 2 additions & 1 deletion dlio_benchmark/data_generator/png_generator.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,8 @@
import logging
import numpy as np

from dlio_benchmark.utils.utility import progress, utcnow, Profile
from dlio_benchmark.utils.utility import progress, utcnow
from dlio_profiler.logger import fn_interceptor as Profile
from shutil import copyfile
import PIL.Image as im
from dlio_benchmark.common.constants import MODULE_DATA_GENERATOR
Expand Down
3 changes: 2 additions & 1 deletion dlio_benchmark/data_generator/tf_generator.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,8 @@
import numpy as np
import tensorflow as tf

from dlio_benchmark.utils.utility import progress, utcnow, Profile
from dlio_benchmark.utils.utility import progress, utcnow
from dlio_profiler.logger import fn_interceptor as Profile
from shutil import copyfile
from dlio_benchmark.common.constants import MODULE_DATA_GENERATOR

Expand Down
3 changes: 2 additions & 1 deletion dlio_benchmark/data_loader/dali_data_loader.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,8 @@
from dlio_benchmark.common.enumerations import Shuffle, DataLoaderType, DatasetType
from dlio_benchmark.data_loader.base_data_loader import BaseDataLoader
from dlio_benchmark.reader.reader_factory import ReaderFactory
from dlio_benchmark.utils.utility import utcnow, get_rank, timeit, Profile
from dlio_benchmark.utils.utility import utcnow, get_rank, timeit
from dlio_profiler.logger import fn_interceptor as Profile

dlp = Profile(MODULE_DATA_LOADER)

Expand Down
3 changes: 2 additions & 1 deletion dlio_benchmark/data_loader/tf_data_loader.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,8 @@
from dlio_benchmark.common.enumerations import DataLoaderType, Shuffle, FormatType, DatasetType
from dlio_benchmark.data_loader.base_data_loader import BaseDataLoader
from dlio_benchmark.reader.reader_factory import ReaderFactory
from dlio_benchmark.utils.utility import utcnow, Profile
from dlio_benchmark.utils.utility import utcnow
from dlio_profiler.logger import fn_interceptor as Profile

import numpy as np

Expand Down
3 changes: 2 additions & 1 deletion dlio_benchmark/data_loader/torch_data_loader.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,8 @@
from dlio_benchmark.common.enumerations import Shuffle, DatasetType, DataLoaderType
from dlio_benchmark.data_loader.base_data_loader import BaseDataLoader
from dlio_benchmark.reader.reader_factory import ReaderFactory
from dlio_benchmark.utils.utility import utcnow, get_rank, Profile
from dlio_benchmark.utils.utility import utcnow, get_rank
from dlio_profiler.logger import fn_interceptor as Profile

dlp = Profile(MODULE_DATA_LOADER)

Expand Down
3 changes: 2 additions & 1 deletion dlio_benchmark/framework/tf_framework.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,8 @@

from dlio_benchmark.common.constants import MODULE_AI_FRAMEWORK
from dlio_benchmark.data_loader.data_loader_factory import DataLoaderFactory
from dlio_benchmark.utils.utility import utcnow, Profile
from dlio_benchmark.utils.utility import utcnow
from dlio_profiler.logger import fn_interceptor as Profile
from dlio_benchmark.common.error_code import ErrorCodes
from dlio_benchmark.framework.framework import Framework
from dlio_benchmark.reader.reader_factory import ReaderFactory
Expand Down
3 changes: 2 additions & 1 deletion dlio_benchmark/framework/torch_framework.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,8 @@
import torch
import functools
import logging
from dlio_benchmark.utils.utility import utcnow, PerfTrace, Profile
from dlio_benchmark.utils.utility import utcnow
from dlio_profiler.logger import fn_interceptor as Profile

from time import sleep, time

Expand Down
86 changes: 53 additions & 33 deletions dlio_benchmark/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,7 @@
import logging
import pandas as pd
from time import time, sleep
import json
import json
import numpy as np

# Reduce TF and CUDA logging
Expand All @@ -36,7 +36,7 @@
warnings.filterwarnings("ignore", category=UserWarning)

from dataclasses import dataclass
from dlio_benchmark.utils.utility import utcnow, measure_performance, PerfTrace, Profile
from dlio_benchmark.utils.utility import utcnow, measure_performance, get_trace_name, get_rank
from omegaconf import DictConfig, OmegaConf
from dlio_benchmark.utils.statscounter import StatsCounter
from hydra.core.config_store import ConfigStore
Expand All @@ -46,7 +46,11 @@
from dlio_benchmark.framework.framework_factory import FrameworkFactory
from dlio_benchmark.data_generator.generator_factory import GeneratorFactory
from dlio_benchmark.storage.storage_factory import StorageFactory
from dlio_profiler.logger import dlio_logger as PerfTrace, fn_interceptor as Profile

dlp = Profile(MODULE_DLIO_BENCHMARK)


class DLIOBenchmark(object):
"""
The Benchmark represents the I/O behavior of deep learning applications.
Expand All @@ -72,12 +76,31 @@ def __init__(self, cfg):
except:
self.args.output_folder = 'output/'
self.output_folder = self.args.output_folder
PerfTrace.initialize_log(self.args.output_folder, f"{os.path.abspath(self.args.data_folder)}:{self.args.data_folder}:./{self.args.data_folder}")
with Profile(name=f"{self.__init__.__qualname__}", cat=MODULE_DLIO_BENCHMARK):
self.storage = StorageFactory().get_storage(self.args.storage_type, self.args.storage_root, self.args.framework)
self.storage = StorageFactory().get_storage(self.args.storage_type, self.args.storage_root,
self.args.framework)

self.output_folder = self.args.output_folder
self.logfile = os.path.join(self.output_folder, self.args.log_file)
self.output_folder = self.args.output_folder
self.output = StorageFactory().get_storage(self.args.storage_type, self.args.output_folder,
self.args.framework)
self.output.create_namespace(exist_ok=True)
self.logfile = os.path.join(self.output_folder, self.args.log_file)
# Configure the logging library
log_level = logging.DEBUG if self.args.debug else logging.INFO
logging.basicConfig(
level=log_level,
handlers=[
logging.FileHandler(self.logfile, mode="a", encoding='utf-8'),
logging.StreamHandler()
],
format='[%(levelname)s] %(message)s [%(pathname)s:%(lineno)d]'
# logging's max timestamp resolution is msecs, we will pass in usecs in the message
)
dlp_trace = get_trace_name(self.args.output_folder)
logging.info(f"{utcnow()} Profiling DLIO {dlp_trace}")
self.dlp_logger = PerfTrace.initialize_log(logfile=dlp_trace,
data_dir=f"{os.path.abspath(self.args.data_folder)}:{self.args.data_folder}:./{self.args.data_folder}",
process_id=get_rank())
with Profile(name=f"{self.__init__.__qualname__}", cat=MODULE_DLIO_BENCHMARK):
self.data_folder = self.args.data_folder
self.storage_root = self.args.storage_root
if self.args.storage_root:
Expand All @@ -92,18 +115,6 @@ def __init__(self, cfg):
if os.path.isfile(self.logfile):
os.remove(self.logfile)
self.framework.barrier()
# Configure the logging library
log_level = logging.DEBUG if self.args.debug else logging.INFO
logging.basicConfig(
level=log_level,
handlers=[
logging.FileHandler(self.logfile, mode="a", encoding='utf-8'),
logging.StreamHandler()
],
format='[%(levelname)s] %(message)s [%(pathname)s:%(lineno)d]'
# logging's max timestamp resolution is msecs, we will pass in usecs in the message
)

if self.args.my_rank == 0:
logging.info(f"{utcnow()} Running DLIO with {self.args.comm_size} process(es)")
try:
Expand Down Expand Up @@ -149,6 +160,7 @@ def __init__(self, cfg):
self.eval_after_epoch = self.args.eval_after_epoch
self.epochs_between_evals = self.args.epochs_between_evals
self.stats = StatsCounter()

@dlp.log
def initialize(self):
"""
Expand Down Expand Up @@ -180,40 +192,45 @@ def initialize(self):
file_list_eval = []
num_subfolders = 0
for dataset_type in [DatasetType.TRAIN, DatasetType.VALID]:
if dataset_type==DatasetType.TRAIN:
if dataset_type == DatasetType.TRAIN:
num_subfolders = self.num_subfolders_train
else:
num_subfolders = self.num_subfolders_eval
filenames = self.storage.walk_node(os.path.join(self.args.data_folder, f"{dataset_type}"))
if (len(filenames)==0):
if (len(filenames) == 0):
continue
if self.storage.get_node(
os.path.join(self.args.data_folder, f"{dataset_type}",
filenames[0])) == MetadataType.DIRECTORY:
assert(num_subfolders == len(filenames))
fullpaths = self.storage.walk_node(os.path.join(self.args.data_folder, f"{dataset_type}/*/*.{self.args.format}"),
use_pattern=True)
assert (num_subfolders == len(filenames))
fullpaths = self.storage.walk_node(
os.path.join(self.args.data_folder, f"{dataset_type}/*/*.{self.args.format}"),
use_pattern=True)
files = [self.storage.get_basename(f) for f in fullpaths]
idx = np.argsort(files)
fullpaths = [fullpaths[i] for i in idx]
else:
assert(num_subfolders==0)
assert (num_subfolders == 0)
fullpaths = [self.storage.get_uri(os.path.join(self.args.data_folder, f"{dataset_type}", entry))
for entry in filenames if entry.find(f'{self.args.format}')!=-1]
for entry in filenames if entry.find(f'{self.args.format}') != -1]
fullpaths = sorted(fullpaths)
if dataset_type is DatasetType.TRAIN:
file_list_train = fullpaths
elif dataset_type is DatasetType.VALID:
file_list_eval = fullpaths
if not self.generate_only and self.num_files_train > len(file_list_train):
raise Exception("Not enough training dataset is found; Please run the code with ++workload.workflow.generate_data=True")
raise Exception(
"Not enough training dataset is found; Please run the code with ++workload.workflow.generate_data=True")
if self.do_eval and self.num_files_eval > len(file_list_eval):
raise Exception("Not enough evaluation dataset is found; Please run the code with ++workload.workflow.generate_data=True")
raise Exception(
"Not enough evaluation dataset is found; Please run the code with ++workload.workflow.generate_data=True")
if (self.num_files_train < len(file_list_train)):
logging.warning(f"Number of files for training in {os.path.join(self.args.data_folder, f'{DatasetType.TRAIN}')} ({len(file_list_train)}) is more than requested ({self.num_files_train}). A subset of files will be used ")
logging.warning(
f"Number of files for training in {os.path.join(self.args.data_folder, f'{DatasetType.TRAIN}')} ({len(file_list_train)}) is more than requested ({self.num_files_train}). A subset of files will be used ")
file_list_train = file_list_train[:self.num_files_train]
if (self.num_files_eval < len(file_list_eval)):
logging.warning(f"Number of files for evaluation in {os.path.join(self.args.data_folder, f'{DatasetType.VALID}')} ({len(file_list_eval)}) is more than requested ({self.num_files_eval}). A subset of files will be used ")
logging.warning(
f"Number of files for evaluation in {os.path.join(self.args.data_folder, f'{DatasetType.VALID}')} ({len(file_list_eval)}) is more than requested ({self.num_files_eval}). A subset of files will be used ")
file_list_eval = file_list_eval[:self.num_files_eval]
self.args.derive_configurations(file_list_train, file_list_eval)
self.args.validate()
Expand Down Expand Up @@ -246,6 +263,7 @@ def _eval(self, epoch):
break
t0 = time()
return step - 1

@dlp.log
def _train(self, epoch):
"""
Expand Down Expand Up @@ -334,7 +352,7 @@ def run(self):
self.next_checkpoint_epoch = self.checkpoint_after_epoch

for epoch in range(1, self.epochs + 1):
self.next_checkpoint_step = self.steps_between_checkpoints
self.next_checkpoint_step = self.steps_between_checkpoints
self.stats.start_train(epoch)

# Initialize the dataset
Expand All @@ -352,6 +370,7 @@ def run(self):
self.stats.end_eval(epoch)
self.framework.get_loader(DatasetType.VALID).finalize()
self.stats.end_run()

@dlp.log
def finalize(self):
"""
Expand All @@ -376,7 +395,9 @@ def finalize(self):
# Save collected stats to disk
self.stats.finalize()
self.stats.save_data()
self.framework.barrier()
self.framework.barrier()
self.dlp_logger.finalize()


@hydra.main(version_base=None, config_path="configs", config_name="config")
def main(cfg: DictConfig) -> None:
Expand All @@ -388,7 +409,6 @@ def main(cfg: DictConfig) -> None:
benchmark.initialize()
benchmark.run()
benchmark.finalize()
PerfTrace.get_instance().finalize()


if __name__ == '__main__':
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,8 @@
from dlio_benchmark.common.enumerations import Shuffle, DatasetType, DataLoaderType
from dlio_benchmark.data_loader.base_data_loader import BaseDataLoader
from dlio_benchmark.reader.reader_factory import ReaderFactory
from dlio_benchmark.utils.utility import utcnow, get_rank, Profile
from dlio_benchmark.utils.utility import utcnow, get_rank
from dlio_profiler.logger import fn_interceptor as Profile

dlp = Profile(MODULE_DATA_LOADER)

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,8 @@

from dlio_benchmark.common.constants import MODULE_DATA_READER
from dlio_benchmark.reader.reader_handler import FormatReader
from dlio_benchmark.utils.utility import Profile

from dlio_profiler.logger import fn_interceptor as Profile

dlp = Profile(MODULE_DATA_READER)

Expand Down
2 changes: 1 addition & 1 deletion dlio_benchmark/reader/csv_reader.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@
import pandas as pd

from dlio_benchmark.common.constants import MODULE_DATA_READER
from dlio_benchmark.utils.utility import Profile
from dlio_profiler.logger import fn_interceptor as Profile
from dlio_benchmark.reader.reader_handler import FormatReader

dlp = Profile(MODULE_DATA_READER)
Expand Down
2 changes: 1 addition & 1 deletion dlio_benchmark/reader/hdf5_reader.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,7 @@
import h5py

from dlio_benchmark.common.constants import MODULE_DATA_READER
from dlio_benchmark.utils.utility import Profile
from dlio_profiler.logger import fn_interceptor as Profile
from dlio_benchmark.reader.reader_handler import FormatReader

dlp = Profile(MODULE_DATA_READER)
Expand Down
2 changes: 1 addition & 1 deletion dlio_benchmark/reader/jpeg_reader.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,7 @@

from dlio_benchmark.common.constants import MODULE_DATA_READER
from dlio_benchmark.reader.reader_handler import FormatReader
from dlio_benchmark.utils.utility import Profile
from dlio_profiler.logger import fn_interceptor as Profile

dlp = Profile(MODULE_DATA_READER)

Expand Down
2 changes: 1 addition & 1 deletion dlio_benchmark/reader/npz_reader.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,7 @@

from dlio_benchmark.common.constants import MODULE_DATA_READER
from dlio_benchmark.reader.reader_handler import FormatReader
from dlio_benchmark.utils.utility import Profile
from dlio_profiler.logger import fn_interceptor as Profile

dlp = Profile(MODULE_DATA_READER)

Expand Down
Loading

0 comments on commit 035f512

Please sign in to comment.