Skip to content

Commit

Permalink
Merge branch 'main' into metric
Browse files Browse the repository at this point in the history
  • Loading branch information
zhenghh04 authored Jun 11, 2024
2 parents 75bdee2 + 01283ab commit 03f1e4b
Show file tree
Hide file tree
Showing 39 changed files with 124 additions and 60 deletions.
2 changes: 1 addition & 1 deletion dlio_benchmark/checkpointing/pytorch_checkpointing.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,7 @@
import torch

from dlio_benchmark.checkpointing.base_checkpointing import BaseCheckpointing
from dlio_profiler.logger import fn_interceptor as Profile
from dlio_benchmark.utils.utility import Profile

from dlio_benchmark.common.constants import MODULE_CHECKPOINT
from dlio_benchmark.common.enumerations import CheckpointLocationType
Expand Down
2 changes: 1 addition & 1 deletion dlio_benchmark/checkpointing/tf_checkpointing.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@
import os

from dlio_benchmark.checkpointing.base_checkpointing import BaseCheckpointing
from dlio_profiler.logger import fn_interceptor as Profile
from dlio_benchmark.utils.utility import Profile
import tensorflow as tf

from dlio_benchmark.common.constants import MODULE_CHECKPOINT
Expand Down
6 changes: 6 additions & 0 deletions dlio_benchmark/configs/workload/cosmoflow_a100.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,13 @@ reader:
data_loader: tensorflow
read_threads: 4
batch_size: 1
file_shuffle: seed
sample_shuffle: seed
shuffle_size: 2

train:
epochs: 5
computation_time: 0.00551

metric:
au: 0.70
8 changes: 7 additions & 1 deletion dlio_benchmark/configs/workload/cosmoflow_h100.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,13 @@ reader:
data_loader: tensorflow
read_threads: 4
batch_size: 1

file_shuffle: seed
sample_shuffle: seed
shuffle_size: 2

train:
epochs: 5
computation_time: 0.00350

metric:
au: 0.70
5 changes: 4 additions & 1 deletion dlio_benchmark/configs/workload/resnet50_a100.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -23,4 +23,7 @@ reader:
read_threads: 8
computation_threads: 8
batch_size: 400
dont_use_mmap: True
dont_use_mmap: True

metric:
au: 0.90
5 changes: 4 additions & 1 deletion dlio_benchmark/configs/workload/resnet50_h100.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -23,4 +23,7 @@ reader:
read_threads: 8
computation_threads: 8
batch_size: 400
dont_use_mmap: True


metric:
au: 0.90
3 changes: 3 additions & 0 deletions dlio_benchmark/configs/workload/unet3d_a100.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -32,3 +32,6 @@ checkpoint:
checkpoint_after_epoch: 5
epochs_between_checkpoints: 2
model_size: 499153191

metric:
au: 0.90
3 changes: 3 additions & 0 deletions dlio_benchmark/configs/workload/unet3d_h100.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -32,3 +32,6 @@ checkpoint:
checkpoint_after_epoch: 5
epochs_between_checkpoints: 2
model_size: 499153191

metric:
au: 0.90
2 changes: 1 addition & 1 deletion dlio_benchmark/data_generator/hdf5_generator.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,7 @@
from dlio_benchmark.common.enumerations import Compression
from dlio_benchmark.data_generator.data_generator import DataGenerator
from dlio_benchmark.utils.utility import progress
from dlio_profiler.logger import fn_interceptor as Profile
from dlio_benchmark.utils.utility import Profile
from shutil import copyfile

from dlio_benchmark.common.constants import MODULE_DATA_GENERATOR
Expand Down
2 changes: 1 addition & 1 deletion dlio_benchmark/data_generator/indexed_binary_generator.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,7 @@
import numpy as np

from dlio_benchmark.utils.utility import progress, utcnow, DLIOMPI
from dlio_profiler.logger import fn_interceptor as Profile
from dlio_benchmark.utils.utility import Profile
from shutil import copyfile
from dlio_benchmark.common.constants import MODULE_DATA_GENERATOR
import struct
Expand Down
2 changes: 1 addition & 1 deletion dlio_benchmark/data_generator/jpeg_generator.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,7 @@
import numpy as np

from dlio_benchmark.utils.utility import progress, utcnow
from dlio_profiler.logger import fn_interceptor as Profile
from dlio_benchmark.utils.utility import Profile
from shutil import copyfile
import PIL.Image as im
from dlio_benchmark.common.constants import MODULE_DATA_GENERATOR
Expand Down
2 changes: 1 addition & 1 deletion dlio_benchmark/data_generator/npy_generator.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,7 @@
import numpy as np

from dlio_benchmark.utils.utility import progress, utcnow
from dlio_profiler.logger import fn_interceptor as Profile
from dlio_benchmark.utils.utility import Profile
from shutil import copyfile
from dlio_benchmark.common.constants import MODULE_DATA_GENERATOR

Expand Down
2 changes: 1 addition & 1 deletion dlio_benchmark/data_generator/npz_generator.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,7 @@
import numpy as np

from dlio_benchmark.utils.utility import progress, utcnow
from dlio_profiler.logger import fn_interceptor as Profile
from dlio_benchmark.utils.utility import Profile
from shutil import copyfile
from dlio_benchmark.common.constants import MODULE_DATA_GENERATOR

Expand Down
2 changes: 1 addition & 1 deletion dlio_benchmark/data_generator/png_generator.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,7 @@
import numpy as np

from dlio_benchmark.utils.utility import progress, utcnow
from dlio_profiler.logger import fn_interceptor as Profile
from dlio_benchmark.utils.utility import Profile
from shutil import copyfile
import PIL.Image as im
from dlio_benchmark.common.constants import MODULE_DATA_GENERATOR
Expand Down
2 changes: 1 addition & 1 deletion dlio_benchmark/data_generator/synthetic_generator.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,7 @@
import numpy as np

from dlio_benchmark.utils.utility import progress, utcnow
from dlio_profiler.logger import fn_interceptor as Profile
from dlio_benchmark.utils.utility import Profile
from shutil import copyfile
import PIL.Image as im
from dlio_benchmark.common.constants import MODULE_DATA_GENERATOR
Expand Down
2 changes: 1 addition & 1 deletion dlio_benchmark/data_generator/tf_generator.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,7 @@
from dlio_benchmark.data_generator.data_generator import DataGenerator
import numpy as np
import tensorflow as tf
from dlio_profiler.logger import fn_interceptor as Profile
from dlio_benchmark.utils.utility import Profile

from dlio_benchmark.utils.utility import progress, utcnow
from shutil import copyfile
Expand Down
2 changes: 1 addition & 1 deletion dlio_benchmark/data_loader/dali_data_loader.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,7 @@
from dlio_benchmark.data_loader.base_data_loader import BaseDataLoader
from dlio_benchmark.reader.reader_factory import ReaderFactory
from dlio_benchmark.utils.utility import utcnow
from dlio_profiler.logger import fn_interceptor as Profile
from dlio_benchmark.utils.utility import Profile
import os

dlp = Profile(MODULE_DATA_LOADER)
Expand Down
2 changes: 1 addition & 1 deletion dlio_benchmark/data_loader/synthetic_data_loader.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,7 @@
from dlio_benchmark.data_loader.base_data_loader import BaseDataLoader
from dlio_benchmark.reader.reader_factory import ReaderFactory
from dlio_benchmark.utils.utility import utcnow
from dlio_profiler.logger import fn_interceptor as Profile
from dlio_benchmark.utils.utility import Profile
import os

dlp = Profile(MODULE_DATA_LOADER)
Expand Down
2 changes: 1 addition & 1 deletion dlio_benchmark/data_loader/tf_data_loader.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,7 @@
from dlio_benchmark.data_loader.base_data_loader import BaseDataLoader
from dlio_benchmark.reader.reader_factory import ReaderFactory
from dlio_benchmark.utils.utility import utcnow
from dlio_profiler.logger import fn_interceptor as Profile
from dlio_benchmark.utils.utility import Profile

import numpy as np

Expand Down
2 changes: 1 addition & 1 deletion dlio_benchmark/data_loader/torch_data_loader.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,7 +29,7 @@
from dlio_benchmark.reader.reader_factory import ReaderFactory
from dlio_benchmark.utils.utility import utcnow, DLIOMPI
from dlio_benchmark.utils.config import ConfigArguments
from dlio_profiler.logger import fn_interceptor as Profile
from dlio_benchmark.utils.utility import Profile

dlp = Profile(MODULE_DATA_LOADER)

Expand Down
2 changes: 1 addition & 1 deletion dlio_benchmark/framework/tf_framework.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,7 @@
from dlio_benchmark.common.constants import MODULE_AI_FRAMEWORK
from dlio_benchmark.data_loader.data_loader_factory import DataLoaderFactory
from dlio_benchmark.utils.utility import utcnow, DLIOMPI
from dlio_profiler.logger import fn_interceptor as Profile
from dlio_benchmark.utils.utility import Profile
from dlio_benchmark.common.error_code import ErrorCodes
from dlio_benchmark.framework.framework import Framework
from dlio_benchmark.reader.reader_factory import ReaderFactory
Expand Down
2 changes: 1 addition & 1 deletion dlio_benchmark/framework/torch_framework.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,7 @@
import functools
import logging
from dlio_benchmark.utils.utility import utcnow, DLIOMPI
from dlio_profiler.logger import fn_interceptor as Profile
from dlio_benchmark.utils.utility import Profile

from time import sleep, time

Expand Down
9 changes: 6 additions & 3 deletions dlio_benchmark/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@
"""
import os
import math
import hydra

import logging
from time import time, sleep
import json
Expand Down Expand Up @@ -46,10 +46,13 @@
from dlio_benchmark.framework.framework_factory import FrameworkFactory
from dlio_benchmark.data_generator.generator_factory import GeneratorFactory
from dlio_benchmark.storage.storage_factory import StorageFactory
from dlio_profiler.logger import dlio_logger as PerfTrace, fn_interceptor as Profile
from dlio_benchmark.utils.utility import Profile, PerfTrace

dlp = Profile(MODULE_DLIO_BENCHMARK)

from mpi4py import MPI
# A barrier is needed here to make sure the output folder is the same on all nodes.
MPI.COMM_WORLD.Barrier()
import hydra

class DLIOBenchmark(object):
"""
Expand Down
2 changes: 1 addition & 1 deletion dlio_benchmark/reader/csv_reader.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@
import pandas as pd

from dlio_benchmark.common.constants import MODULE_DATA_READER
from dlio_profiler.logger import fn_interceptor as Profile
from dlio_benchmark.utils.utility import Profile
from dlio_benchmark.reader.reader_handler import FormatReader

dlp = Profile(MODULE_DATA_READER)
Expand Down
2 changes: 1 addition & 1 deletion dlio_benchmark/reader/hdf5_reader.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,7 @@
import h5py

from dlio_benchmark.common.constants import MODULE_DATA_READER
from dlio_profiler.logger import fn_interceptor as Profile
from dlio_benchmark.utils.utility import Profile
from dlio_benchmark.reader.reader_handler import FormatReader

dlp = Profile(MODULE_DATA_READER)
Expand Down
2 changes: 1 addition & 1 deletion dlio_benchmark/reader/image_reader.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,7 @@
from dlio_benchmark.common.constants import MODULE_DATA_READER
from dlio_benchmark.reader.reader_handler import FormatReader
from dlio_benchmark.utils.utility import utcnow
from dlio_profiler.logger import fn_interceptor as Profile
from dlio_benchmark.utils.utility import Profile

dlp = Profile(MODULE_DATA_READER)

Expand Down
2 changes: 1 addition & 1 deletion dlio_benchmark/reader/indexed_binary_mmap_reader.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,7 @@
from dlio_benchmark.common.constants import MODULE_DATA_READER
from dlio_benchmark.common.enumerations import DataLoaderSampler
from dlio_benchmark.reader.reader_handler import FormatReader
from dlio_profiler.logger import fn_interceptor as Profile
from dlio_benchmark.utils.utility import Profile

dlp = Profile(MODULE_DATA_READER)

Expand Down
2 changes: 1 addition & 1 deletion dlio_benchmark/reader/indexed_binary_reader.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,7 @@
from dlio_benchmark.common.constants import MODULE_DATA_READER
from dlio_benchmark.common.enumerations import DataLoaderSampler
from dlio_benchmark.reader.reader_handler import FormatReader
from dlio_profiler.logger import fn_interceptor as Profile
from dlio_benchmark.utils.utility import Profile

dlp = Profile(MODULE_DATA_READER)

Expand Down
2 changes: 1 addition & 1 deletion dlio_benchmark/reader/npy_reader.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,7 @@

from dlio_benchmark.common.constants import MODULE_DATA_READER
from dlio_benchmark.reader.reader_handler import FormatReader
from dlio_profiler.logger import fn_interceptor as Profile
from dlio_benchmark.utils.utility import Profile

dlp = Profile(MODULE_DATA_READER)

Expand Down
2 changes: 1 addition & 1 deletion dlio_benchmark/reader/npz_reader.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,7 @@

from dlio_benchmark.common.constants import MODULE_DATA_READER
from dlio_benchmark.reader.reader_handler import FormatReader
from dlio_profiler.logger import fn_interceptor as Profile
from dlio_benchmark.utils.utility import Profile

dlp = Profile(MODULE_DATA_READER)

Expand Down
2 changes: 1 addition & 1 deletion dlio_benchmark/reader/reader_handler.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,7 @@
from dlio_benchmark.framework.framework_factory import FrameworkFactory
from dlio_benchmark.storage.storage_factory import StorageFactory
from dlio_benchmark.utils.utility import utcnow
from dlio_profiler.logger import fn_interceptor as Profile
from dlio_benchmark.utils.utility import Profile
from dlio_benchmark.utils.config import ConfigArguments
import numpy as np
import os
Expand Down
2 changes: 1 addition & 1 deletion dlio_benchmark/reader/synthetic_reader.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,7 @@

from dlio_benchmark.common.constants import MODULE_DATA_READER
from dlio_benchmark.reader.reader_handler import FormatReader
from dlio_profiler.logger import fn_interceptor as Profile
from dlio_benchmark.utils.utility import Profile

dlp = Profile(MODULE_DATA_READER)

Expand Down
44 changes: 21 additions & 23 deletions dlio_benchmark/reader/tf_reader.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,7 @@

from dlio_benchmark.common.constants import MODULE_DATA_READER
from dlio_benchmark.utils.utility import utcnow
from dlio_profiler.logger import fn_interceptor as Profile
from dlio_benchmark.utils.utility import Profile
from dlio_benchmark.common.enumerations import DatasetType, Shuffle
from dlio_benchmark.reader.reader_handler import FormatReader
import tensorflow as tf
Expand Down Expand Up @@ -81,35 +81,33 @@ def _parse_image(self, serialized):

@dlp.log
def next(self):
logging.debug(
f"{utcnow()} Reading {len(self._file_list)} files thread {self.thread_index} rank {self._args.my_rank}")
self._dataset = tf.data.TFRecordDataset(filenames=self._file_list, buffer_size=self._args.transfer_size,
logging.debug(f"{utcnow()} Reading {len(self._file_list)} files thread {self.thread_index} rank {self._args.my_rank}")
filenames = tf.data.Dataset.list_files(self._file_list, shuffle=True)
# Shard the file list if we have enough files.
if (len(self._file_list) >= self._args.comm_size):
filenames = filenames.shard(num_shards=self._args.comm_size, index=self._args.my_rank)

self._dataset = tf.data.TFRecordDataset(filenames=filenames, buffer_size=self._args.transfer_size,
num_parallel_reads=self._args.read_threads)

if self._args.sample_shuffle != Shuffle.OFF:
if self._args.sample_shuffle == Shuffle.SEED:
self._dataset = self._dataset.shuffle(buffer_size=self._args.shuffle_size,
seed=self._args.seed)
else:
self._dataset = self._dataset.shuffle(buffer_size=self._args.shuffle_size)

self._dataset = self._dataset.shard(num_shards=self._args.comm_size, index=self._args.my_rank)
if self._args.computation_threads==0:
self._dataset = self._dataset.batch(self.batch_size, drop_remainder=True)
else:
if self._args.computation_threads <= self.batch_size:
self._dataset = self._dataset.batch(self.batch_size, drop_remainder=True)
self._dataset = self._dataset.map(
lambda x: tf.py_function(func=self._parse_image, inp=[x], Tout=[tf.uint8]),
num_parallel_calls=self._args.computation_threads)
else:
self._dataset = self._dataset.batch(self._args.computation_threads)
self._dataset = self._dataset.map(
lambda x: tf.py_function(func=self._parse_image, inp=[x], Tout=[tf.uint8]),
num_parallel_calls=self._args.computation_threads)
self._dataset = self._dataset.unbatch(self.batch_size)
self._dataset = self._dataset.repeat(self._args.epochs)
total = math.ceil(len(self._file_list)/self._args.comm_size / self.batch_size * self._args.num_samples_per_file)

# shard the dataset if it is not done already.
if (len(self._file_list) < self._args.comm_size):
self._dataset = self._dataset.shard(num_shards=self._args.comm_size, index=self._args.my_rank)

self._dataset = self._dataset.batch(self.batch_size, drop_remainder=True)
self._dataset = self._dataset.map(
lambda x: tf.py_function(func=self._parse_image, inp=[x], Tout=[tf.uint8]),
num_parallel_calls=self._args.computation_threads)

self._dataset = self._dataset.repeat()
total = math.floor(len(self._file_list)/self._args.comm_size / self.batch_size * self._args.num_samples_per_file)
return self._dataset.take(total*self._args.epochs).prefetch(buffer_size=self._args.prefetch_size)

@dlp.log
Expand Down
2 changes: 1 addition & 1 deletion dlio_benchmark/storage/file_storage.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,7 @@
import glob
import shutil

from dlio_profiler.logger import fn_interceptor as Profile
from dlio_benchmark.utils.utility import Profile

dlp = Profile(MODULE_STORAGE)

Expand Down
Loading

0 comments on commit 03f1e4b

Please sign in to comment.