I am getting the error below in multi-GPU training, where it cannot find the tfevents file.
```
trainer.fit(model, data)
  File "/home/csgrad/mbhosale/anaconda3/envs/pathldm1/lib/python3.8/site-packages/pytorch_lightning/trainer/trainer.py", line 553, in fit
    self._run(model)
  File "/home/csgrad/mbhosale/anaconda3/envs/pathldm1/lib/python3.8/site-packages/pytorch_lightning/trainer/trainer.py", line 918, in _run
    self._dispatch()
  File "/home/csgrad/mbhosale/anaconda3/envs/pathldm1/lib/python3.8/site-packages/pytorch_lightning/trainer/trainer.py", line 986, in _dispatch
    self.accelerator.start_training(self)
  File "/home/csgrad/mbhosale/anaconda3/envs/pathldm1/lib/python3.8/site-packages/pytorch_lightning/accelerators/accelerator.py", line 92, in start_training
    self.training_type_plugin.start_training(trainer)
  File "/home/csgrad/mbhosale/anaconda3/envs/pathldm1/lib/python3.8/site-packages/pytorch_lightning/plugins/training_type/training_type_plugin.py", line 161, in start_training
    self._results = trainer.run_stage()
  File "/home/csgrad/mbhosale/anaconda3/envs/pathldm1/lib/python3.8/site-packages/pytorch_lightning/trainer/trainer.py", line 996, in run_stage
    return self._run_train()
  File "/home/csgrad/mbhosale/anaconda3/envs/pathldm1/lib/python3.8/site-packages/pytorch_lightning/trainer/trainer.py", line 1058, in _run_train
    self.training_type_plugin.reconciliate_processes(traceback.format_exc())
  File "/home/csgrad/mbhosale/anaconda3/envs/pathldm1/lib/python3.8/site-packages/pytorch_lightning/plugins/training_type/ddp.py", line 453, in reconciliate_processes
    raise DeadlockDetectedException(f"DeadLock detected from rank: {self.global_rank} \n {trace}")
pytorch_lightning.utilities.exceptions.DeadlockDetectedException: DeadLock detected from rank: 0
Traceback (most recent call last):
  File "/home/csgrad/mbhosale/anaconda3/envs/pathldm1/lib/python3.8/site-packages/pytorch_lightning/trainer/trainer.py", line 1045, in _run_train
    self.fit_loop.run()
  File "/home/csgrad/mbhosale/anaconda3/envs/pathldm1/lib/python3.8/site-packages/pytorch_lightning/loops/base.py", line 111, in run
    self.advance(*args, **kwargs)
  File "/home/csgrad/mbhosale/anaconda3/envs/pathldm1/lib/python3.8/site-packages/pytorch_lightning/loops/fit_loop.py", line 200, in advance
    epoch_output = self.epoch_loop.run(train_dataloader)
  File "/home/csgrad/mbhosale/anaconda3/envs/pathldm1/lib/python3.8/site-packages/pytorch_lightning/loops/base.py", line 111, in run
    self.advance(*args, **kwargs)
  File "/home/csgrad/mbhosale/anaconda3/envs/pathldm1/lib/python3.8/site-packages/pytorch_lightning/loops/epoch/training_epoch_loop.py", line 149, in advance
    self.trainer.call_hook(
  File "/home/csgrad/mbhosale/anaconda3/envs/pathldm1/lib/python3.8/site-packages/pytorch_lightning/trainer/trainer.py", line 1217, in call_hook
    trainer_hook(*args, **kwargs)
  File "/home/csgrad/mbhosale/anaconda3/envs/pathldm1/lib/python3.8/site-packages/pytorch_lightning/trainer/callback_hook.py", line 189, in on_train_batch_end
    callback.on_train_batch_end(self, self.lightning_module, outputs, batch, batch_idx, dataloader_idx)
  File "/home/csgrad/mbhosale/phd/Pathdiff/PathLDM/main.py", line 443, in on_train_batch_end
    self.log_img(pl_module, batch, batch_idx, split="train")
  File "/home/csgrad/mbhosale/phd/Pathdiff/PathLDM/main.py", line 424, in log_img
    logger_log_images(pl_module, images, pl_module.global_step, split)
  File "/home/csgrad/mbhosale/anaconda3/envs/pathldm1/lib/python3.8/site-packages/pytorch_lightning/utilities/distributed.py", line 48, in wrapped_fn
    return fn(*args, **kwargs)
  File "/home/csgrad/mbhosale/phd/Pathdiff/PathLDM/main.py", line 363, in _testtube
    pl_module.logger.experiment.add_image(tag, grid, global_step=pl_module.global_step)
  File "/home/csgrad/mbhosale/anaconda3/envs/pathldm1/lib/python3.8/site-packages/torch/utils/tensorboard/writer.py", line 614, in add_image
    self._get_file_writer().add_summary(
  File "/home/csgrad/mbhosale/anaconda3/envs/pathldm1/lib/python3.8/site-packages/torch/utils/tensorboard/writer.py", line 113, in add_summary
    self.add_event(event, global_step, walltime)
  File "/home/csgrad/mbhosale/anaconda3/envs/pathldm1/lib/python3.8/site-packages/torch/utils/tensorboard/writer.py", line 98, in add_event
    self.event_writer.add_event(event)
  File "/home/csgrad/mbhosale/anaconda3/envs/pathldm1/lib/python3.8/site-packages/tensorboard/summary/writer/event_file_writer.py", line 117, in add_event
    self._async_writer.write(event.SerializeToString())
  File "/home/csgrad/mbhosale/anaconda3/envs/pathldm1/lib/python3.8/site-packages/tensorboard/summary/writer/event_file_writer.py", line 171, in write
    self._check_worker_status()
  File "/home/csgrad/mbhosale/anaconda3/envs/pathldm1/lib/python3.8/site-packages/tensorboard/summary/writer/event_file_writer.py", line 212, in _check_worker_status
    raise exception
  File "/home/csgrad/mbhosale/anaconda3/envs/pathldm1/lib/python3.8/threading.py", line 932, in _bootstrap_inner
    self.run()
  File "/home/csgrad/mbhosale/anaconda3/envs/pathldm1/lib/python3.8/site-packages/tensorboard/summary/writer/event_file_writer.py", line 244, in run
    self._run()
  File "/home/csgrad/mbhosale/anaconda3/envs/pathldm1/lib/python3.8/site-packages/tensorboard/summary/writer/event_file_writer.py", line 275, in _run
    self._record_writer.write(data)
  File "/home/csgrad/mbhosale/anaconda3/envs/pathldm1/lib/python3.8/site-packages/tensorboard/summary/writer/record_writer.py", line 40, in write
    self._writer.write(header + header_crc + data + footer_crc)
  File "/home/csgrad/mbhosale/anaconda3/envs/pathldm1/lib/python3.8/site-packages/tensorboard/compat/tensorflow_stub/io/gfile.py", line 773, in write
    self.fs.append(self.filename, file_content, self.binary_mode)
  File "/home/csgrad/mbhosale/anaconda3/envs/pathldm1/lib/python3.8/site-packages/tensorboard/compat/tensorflow_stub/io/gfile.py", line 167, in append
    self._write(filename, file_content, "ab" if binary_mode else "a")
  File "/home/csgrad/mbhosale/anaconda3/envs/pathldm1/lib/python3.8/site-packages/tensorboard/compat/tensorflow_stub/io/gfile.py", line 171, in _write
    with io.open(filename, mode, encoding=encoding) as f:
FileNotFoundError: [Errno 2] No such file or directory: b'logs/06-03T05-49_plip_imagenet_finetune_PanNuke/testtube/version_0/tf/events.out.tfevents.1717408192.deepbull8.818802.0'
```
I checked the filesystem: there is no folder named tf under version_0. Interestingly, I get this error only when I run on multiple GPUs; with a single GPU the run somehow completes fine. I have no idea how to resolve this or even where to start debugging.
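The only mitigation I can think of is to recreate the directory the event writer expects before the image-logging callback fires. Below is a minimal sketch of that idea; the path is copied from the error above, the LOCAL_RANK lookup is an assumption about how the DDP launcher tags its processes, and whether this touches the root cause is exactly what I'm unsure about:

```python
import os

# Hypothetical workaround sketch, not a confirmed fix. The path is taken
# verbatim from the FileNotFoundError above; a real run would derive it
# from the logger's configured save_dir and version instead.
tf_dir = "logs/06-03T05-49_plip_imagenet_finetune_PanNuke/testtube/version_0/tf"

# Report from each process whether the directory the event writer appends
# to actually exists (LOCAL_RANK is assumed to be set by the DDP launcher).
rank = int(os.environ.get("LOCAL_RANK", "0"))
print(f"rank {rank}: {tf_dir} exists = {os.path.isdir(tf_dir)}")

# Defensively recreate it before the ImageLogger callback tries to write.
os.makedirs(tf_dir, exist_ok=True)
```

Even if something like this unblocks the run, it would not explain why the tf folder is missing only under multi-GPU, so any pointers on the actual cause would be appreciated.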