Skip to content

Commit

Permalink
added excluding start and end steps
Browse files Browse the repository at this point in the history
  • Loading branch information
zhenghh04 committed Feb 14, 2025
1 parent 5d4eb59 commit 0e0b5a5
Show file tree
Hide file tree
Showing 4 changed files with 37 additions and 17 deletions.
24 changes: 13 additions & 11 deletions dlio_benchmark/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -250,16 +250,8 @@ def _train(self, epoch):
loader = self.framework.get_loader(dataset_type=DatasetType.TRAIN)
self.stats.start_loading()
for batch in loader.next():
if overall_step > max_steps or ((self.total_training_steps > 0) and (overall_step > self.total_training_steps)):
if self.args.my_rank == 0:
logging.info(f"{utcnow()} Maximum number of steps reached")
if (block_step != 1 and self.do_checkpoint) or (not self.do_checkpoint):
self.stats.end_block(epoch, block, block_step - 1)
break
self.stats.batch_loaded(epoch, overall_step, block)
# Log a new block, unless it's the first one which we've already logged before the loop
if block_step == 1 and block != 1:
self.stats.start_block(epoch, block)

computation_time = self.args.computation_time
if (isinstance(computation_time, dict) and len(computation_time) > 0) or (isinstance(computation_time, float) and computation_time > 0):
self.framework.trace_object("Train", overall_step, 1)
Expand All @@ -280,11 +272,21 @@ def _train(self, epoch):
else:
block_step += 1
overall_step += 1
if overall_step > max_steps or ((self.total_training_steps > 0) and (overall_step > self.total_training_steps)):
if self.args.my_rank == 0:
logging.info(f"{utcnow()} Maximum number of steps reached")
if (not self.do_checkpoint):
self.stats.end_block(epoch, block, block_step - 1)
break
# start a new block here
if block_step == 1 and block != 1:
self.stats.start_block(epoch, block)
self.stats.start_loading()

self.comm.barrier()
if self.do_checkpoint and (self.steps_between_checkpoints < 0) and (epoch == self.next_checkpoint_epoch):
self.stats.end_block(epoch, block, block_step)
self.stats.start_ckpt(epoch, block, overall_step)
self.stats.end_block(epoch, block, block_step-1)
self.stats.start_ckpt(epoch, block, overall_step-1)
self.checkpointing_mechanism.checkpoint(epoch, overall_step)
self.stats.end_ckpt(epoch, block)
self.next_checkpoint_epoch += self.epochs_between_checkpoints
Expand Down
7 changes: 7 additions & 0 deletions dlio_benchmark/utils/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -68,6 +68,8 @@ class ConfigArguments:
generate_only: bool = False
data_folder: str = "./data/"
output_folder: str = None
metric_exclude_start_steps: int = 1
metric_exclude_end_steps: int = 0
checkpoint_folder: str = "./checkpoints/"
log_file: str = "dlio.log"
file_prefix: str = "img"
Expand Down Expand Up @@ -610,6 +612,11 @@ def LoadConfig(args, config):
args.output_folder = config['output']['folder']
if 'log_file' in config['output']:
args.log_file = config['output']['log_file']
if 'metric' in config['output']:
if 'exclude_start_steps' in config['output']['metric']:
args.metric_exclude_start_steps = int(config['output']['metric']['exclude_start_steps'])
if 'exclude_end_steps' in config['output']['metric']:
args.metric_exclude_end_steps = int(config['output']['metric']['exclude_end_steps'])

if args.output_folder is None:
try:
Expand Down
19 changes: 13 additions & 6 deletions dlio_benchmark/utils/statscounter.py
Original file line number Diff line number Diff line change
Expand Up @@ -82,8 +82,15 @@ def __init__(self):
else:
self.steps_override = False
self.steps = max_steps

self.metric_steps = self.steps - (self.args.metric_exclude_end_steps + self.args.metric_exclude_start_steps)
self.metric_start_step = self.args.metric_exclude_start_steps
self.metric_end_step = self.steps - 1 - self.args.metric_exclude_end_steps
if self.comm.rank == 0:
logging.info(f"{utcnow()} Metric calculation will exclude the beginning {self.args.metric_exclude_start_steps} and end {self.args.metric_exclude_end_steps} steps, only includes {self.metric_steps} steps.")
self.steps_eval = math.floor(self.args.num_samples_per_file * self.args.num_files_eval / self.args.batch_size_eval / self.args.comm_size)
self.metric_steps_eval = self.steps_eval - (self.args.metric_exclude_end_steps + self.args.metric_exclude_start_steps)
self.metric_start_step_eval = self.args.metric_exclude_start_steps
self.metric_end_step_eval = self.steps_eval - 1 - self.args.metric_exclude_end_steps
# Only the root process keeps track of overall stats
if self.my_rank == 0:
self.per_epoch_stats = {}
Expand Down Expand Up @@ -282,7 +289,7 @@ def end_block(self, epoch, block, steps_taken):
self.per_epoch_stats[epoch][f'block{block}']['duration'] = duration
logging.info(f"{utcnow()} Epoch {epoch} - Block {block} [Training] Accelerator Utilization [AU] (%): {self.output[epoch]['au'][f'block{block}']:.4f}")
logging.info(f"{utcnow()} Epoch {epoch} - Block {block} [Training] Throughput (samples/second): {self.output[epoch]['throughput'][f'block{block}']*self.comm_size:.4f}")
logging.info(f"{utcnow()} Epoch {epoch} - Block {block} [Training] Computation time per step (second): {np.mean(self.output[epoch]['compute'][f'block{block}'][1:-1]):.4f}+/-{np.std(self.output[epoch]['compute'][f'block{block}'][1:-1]):.4f} (set value: {self.args.computation_time})")
logging.info(f"{utcnow()} Epoch {epoch} - Block {block} [Training] Computation time per step (second): {np.mean(self.output[epoch]['compute'][f'block{block}'][self.metric_start_step:self.metric_end_step+1]):.4f}+/-{np.std(self.output[epoch]['compute'][f'block{block}'][self.metric_start_step:self.metric_end_step+1]):.4f} (set value: {self.args.computation_time})")

def start_ckpt(self, epoch, block, steps_taken):
if self.my_rank == 0:
Expand Down Expand Up @@ -330,8 +337,8 @@ def batch_processed(self, epoch, step, block):

def compute_metrics_train(self, epoch, block):
key = f"block{block}"
total_compute_time = np.sum(self.output[epoch]['compute'][key][1:-1])
total_time = self.end_timestamp - self.start_timestamp - self.output[epoch]['proc'][key][0] - self.output[epoch]['proc'][key][-1]
total_compute_time = np.sum(self.output[epoch]['compute'][key][self.metric_start_step:self.metric_end_step+1])
total_time = self.end_timestamp - self.start_timestamp - np.sum(self.output[epoch]['proc'][key][:self.metric_start_step]) - np.sum(self.output[epoch]['proc'][key][self.metric_end_step+1:])
if (total_compute_time==0):
au=0.0
else:
Expand All @@ -342,11 +349,11 @@ def compute_metrics_train(self, epoch, block):

def compute_metrics_eval(self, epoch):
key = 'eval'
total_compute_time = np.sum(self.output[epoch]['compute'][key][1:-1])
total_compute_time = np.sum(self.output[epoch]['compute'][key][self.metric_start_step_eval:self.metric_end_step_eval+1])
if (total_compute_time==0):
au=0.0
else:
total_time = self.end_timestamp - self.start_timestamp - self.output[epoch]['proc'][key][0]
total_time = self.end_timestamp - self.start_timestamp - np.sum(self.output[epoch]['proc'][key][:self.metric_start_step_eval]) - np.sum(self.output[epoch]['proc'][key][self.metric_end_step_eval+1:])
au = total_compute_time / total_time
throughput = len(self.output[epoch]['compute'][key])/(self.end_timestamp - self.start_timestamp)*self.batch_size_eval
self.output[epoch]['au'][key] = au*100
Expand Down
4 changes: 4 additions & 0 deletions docs/source/config.rst
Original file line number Diff line number Diff line change
Expand Up @@ -363,6 +363,10 @@ output
* - log_file
- dlio.log
- log file name
* - metric
- {exclude_start_steps: 1, exclude_end_steps: 0}
- To specify the steps to be excluded in the metric calculation. By default, we exclude the first step in
the beginning.

.. note::

Expand Down

0 comments on commit 0e0b5a5

Please sign in to comment.