💡 Your Question
Training seems to go fine at first, but after a few epochs it stops and I get an error that looks like this:
torch.distributed.elastic.multiprocessing.api.SignalException: Process 22636 got signal: 2
The problem clearly lies in DDP, but it happens in the middle of training, which I find very strange. It also does not always happen: the training completed once for 999 iterations, but it mostly crashes somewhere around epoch 30-80.
I have two GPUs, and I can see that both are initially working, so DDP must be set up correctly, but then it just stops.
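For reference, signal 2 is SIGINT (an interrupt, e.g. Ctrl+C, a terminal hang-up, or a job scheduler interrupting the run), which is what the elastic launcher is reporting here. A minimal, standard-library-only check of that mapping:

import signal

# Signal number 2 corresponds to SIGINT (keyboard interrupt / external interruption)
print(signal.Signals(2).name)  # prints "SIGINT"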
import os

from super_gradients import init_trainer
from super_gradients.training import models, Trainer
from super_gradients.common import MultiGPUMode  # needed for parallel processing
from super_gradients.training.utils.distributed_training_utils import setup_device
from super_gradients.training.dataloaders.dataloaders import coco_detection_yolo_format_train, coco_detection_yolo_format_val
from super_gradients.training.losses import PPYoloELoss
from super_gradients.training.metrics import DetectionMetrics_050, DetectionMetrics_050_095, DetectionMetrics_075
from super_gradients.training.models.detection_models.pp_yolo_e import PPYoloEPostPredictionCallback

# Train configuration files
from configurations import TrainModelConfig, TrainParamConfig


def _train():
    # NOTE: CHECKPOINT_DIR must be an absolute path for model loading!
    # Get the absolute path of the current script's directory
    project_root = os.path.dirname(os.path.abspath(__file__))
    CHECKPOINT_DIR: str = os.path.join(project_root, TrainModelConfig.checkpoints_dir_name)
    CHECKPOINT_DIR_FOLDER: str = os.path.abspath(CHECKPOINT_DIR)
    os.makedirs(CHECKPOINT_DIR_FOLDER, exist_ok=True)


def train() -> None:  # main entry point to start the training and set up the GPU device
    # Set up the environment trainer
    init_trainer()
    if TrainModelConfig.multi_gpu_version.upper() == 'DDP':
        # Launch DDP on num_gpus GPUs; this should be the go-to for parallel processing!
        setup_device(device=TrainModelConfig.device, multi_gpu=MultiGPUMode.DISTRIBUTED_DATA_PARALLEL, num_gpus=TrainModelConfig.num_gpus)
    elif TrainModelConfig.multi_gpu_version.upper() == 'DP':
        # Launch DP on num_gpus GPUs -> NOTE: not working (yet)
        setup_device(multi_gpu=MultiGPUMode.DATA_PARALLEL, num_gpus=TrainModelConfig.num_gpus)
    # Call _train()
    _train()


if __name__ == '__main__':
    train()
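The script imports TrainModelConfig and TrainParamConfig from a local configurations module that is not shown in the issue. The following is only a hypothetical sketch of the attributes the script reads (multi_gpu_version, device, num_gpus, checkpoints_dir_name); the actual fields and values may differ:

# configurations.py -- hypothetical sketch, not the author's actual config
class TrainModelConfig:
    multi_gpu_version: str = 'DDP'             # 'DDP' or 'DP', selects the setup_device branch above
    device: str = 'cuda'                       # device string passed to setup_device (assumed)
    num_gpus: int = 2                          # the issue mentions two GPUs
    checkpoints_dir_name: str = 'checkpoints'  # folder created by _train() (assumed name)


class TrainParamConfig:
    # imported by the training script; contents not shown in the issue
    pass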
Versions
No response