From f56998247857b06d6b7a96d48b715c738d249026 Mon Sep 17 00:00:00 2001
From: Ray Andrew
Date: Wed, 23 Oct 2024 22:40:28 +0000
Subject: [PATCH] fix wrong tracing location of data fetch

---
 dlio_benchmark/data_loader/native_dali_data_loader.py | 4 +++-
 dlio_benchmark/data_loader/tf_data_loader.py          | 4 +++-
 dlio_benchmark/data_loader/torch_data_loader.py       | 4 +++-
 dlio_benchmark/main.py                                 | 4 ++--
 4 files changed, 11 insertions(+), 5 deletions(-)

diff --git a/dlio_benchmark/data_loader/native_dali_data_loader.py b/dlio_benchmark/data_loader/native_dali_data_loader.py
index 39ea3198..ab8fb064 100644
--- a/dlio_benchmark/data_loader/native_dali_data_loader.py
+++ b/dlio_benchmark/data_loader/native_dali_data_loader.py
@@ -60,7 +60,9 @@ def next(self):
         pipeline.reset()
         for step in range(num_samples // batch_size):
             try:
-                for batch in self._dataset:
+                # TODO: @hariharan-devarajan: change the line below to
+                # `dlp.iter(self._dataset, name=self.next.__qualname__)` once we bump the dftracer version
+                for batch in dlp.iter(self._dataset):
                     logging.debug(f"{utcnow()} Creating {len(batch)} batches by {self._args.my_rank} rank ")
                     yield batch
             except StopIteration:
diff --git a/dlio_benchmark/data_loader/tf_data_loader.py b/dlio_benchmark/data_loader/tf_data_loader.py
index 695c4523..0ec9f34a 100644
--- a/dlio_benchmark/data_loader/tf_data_loader.py
+++ b/dlio_benchmark/data_loader/tf_data_loader.py
@@ -100,7 +100,9 @@ def read(self):
     @dlp.log
     def next(self):
         super().next()
-        for batch in self._dataset:
+        # TODO: @hariharan-devarajan: change the line below to
+        # `dlp.iter(self._dataset, name=self.next.__qualname__)` once we bump the dftracer version
+        for batch in dlp.iter(self._dataset):
             yield batch
         self.epoch_number += 1
         dlp.update(epoch=self.epoch_number)
diff --git a/dlio_benchmark/data_loader/torch_data_loader.py b/dlio_benchmark/data_loader/torch_data_loader.py
index 80212fff..3048dab1 100644
--- a/dlio_benchmark/data_loader/torch_data_loader.py
+++ b/dlio_benchmark/data_loader/torch_data_loader.py
@@ -166,7 +166,9 @@ def next(self):
         total = self._args.training_steps if self.dataset_type is DatasetType.TRAIN else self._args.eval_steps
         logging.debug(f"{utcnow()} Rank {self._args.my_rank} should read {total} batches")
         step = 1
-        for batch in self._dataset:
+        # TODO: @hariharan-devarajan: change the line below to
+        # `dlp.iter(self._dataset, name=self.next.__qualname__)` once we bump the dftracer version
+        for batch in dlp.iter(self._dataset):
             dlp.update(step = step)
             step += 1
             yield batch
diff --git a/dlio_benchmark/main.py b/dlio_benchmark/main.py
index 981f9c51..1a312ce1 100644
--- a/dlio_benchmark/main.py
+++ b/dlio_benchmark/main.py
@@ -224,7 +224,7 @@ def _eval(self, epoch):
         total = math.floor(self.num_samples * self.num_files_eval / self.batch_size_eval / self.comm_size)
         loader = self.framework.get_loader(DatasetType.VALID)
         t0 = time()
-        for batch in dlp.iter(loader.next()):
+        for batch in loader.next():
             self.stats.eval_batch_loaded(epoch, step, t0)
             eval_time = 0.0
             if self.eval_time > 0:
@@ -256,7 +256,7 @@ def _train(self, epoch):
 
         loader = self.framework.get_loader(dataset_type=DatasetType.TRAIN)
         t0 = time()
-        for batch in dlp.iter(loader.next()):
+        for batch in loader.next():
             if overall_step > max_steps or ((self.total_training_steps > 0) and (overall_step > self.total_training_steps)):
                 if self.args.my_rank == 0:
                     logging.info(f"{utcnow()} Maximum number of steps reached")
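
Notes:

Before this patch, main.py wrapped the generator returned by `loader.next()` in
`dlp.iter(...)`, so the commit title suggests the per-batch fetch events were being
attributed to the training/eval loop in main.py rather than to the data loader that
actually performs the fetch. The patch moves the wrapper inside each loader, around
`self._dataset` itself, so events are recorded at the point where batches are produced.
The sketch below illustrates the difference with a hypothetical `traced_iter` timing
helper; it is a stand-in for illustration only, not the real dftracer `dlp.iter` API:

    import time

    def traced_iter(iterable, name="fetch"):
        """Yield items from `iterable`, timing each fetch individually."""
        it = iter(iterable)
        while True:
            t0 = time.perf_counter()   # window opens just before the fetch
            try:
                item = next(it)        # only the fetch is inside the window
            except StopIteration:
                return
            print(f"{name}: {time.perf_counter() - t0:.6f}s")
            yield item                 # consumer work happens outside the window

    # Old placement (main.py): events are attributed to the consumer loop's
    # call site, one generator layer away from the actual fetch.
    #     for batch in traced_iter(loader.next()): ...
    #
    # New placement (inside the loaders): events are attributed to the point
    # where `self._dataset` produces the batch.
    #     for batch in traced_iter(self._dataset): yield batch

With the placement fixed, the TODO comments record a follow-up: once the dftracer
version is bumped, `dlp.iter` can take a `name=self.next.__qualname__` argument so
each event carries the loader's qualified method name.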