Code crash during multiple training epochs running DDP #2065

Open
TychoBomer opened this issue Nov 11, 2024 · 2 comments

Comments

@TychoBomer

💡 Your Question

Training seems fine at first, but after a few epochs the training stops and I get an error looking like this:

torch.distributed.elastic.multiprocessing.api.SignalException: Process 22636 got signal: 2

The problem is clearly in DDP, but it happens during training, which I find very strange. It also does not always happen: the training has completed once for all 999 iterations. Mostly it crashes somewhere around epoch 30-80.

I have two GPUs and I can see that both are initially working, so the DDP setup must be correct, but then it just stops.
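For reference, signal 2 is SIGINT (the signal Ctrl-C sends), which torch.distributed.elastic surfaces as a SignalException. Below is a minimal diagnostic sketch, not part of the original script, that could be installed early in the launcher process to log where the interrupt arrives before the run stops; the handler name and placement are hypothetical:

import signal
import traceback

def _log_sigint(signum, frame):
    # Diagnostic only: show where the launcher was when SIGINT (signal 2) arrived,
    # then restore the default handler and re-raise so the run still stops.
    print(f"Received signal {signum} (SIGINT); stack at interruption:")
    traceback.print_stack(frame)
    signal.signal(signal.SIGINT, signal.SIG_DFL)
    signal.raise_signal(signal.SIGINT)

signal.signal(signal.SIGINT, _log_sigint)  # install before training starts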

import os
from super_gradients import init_trainer
from super_gradients.training import models, Trainer
from super_gradients.common import MultiGPUMode # needed for parallel processing
from super_gradients.training.utils.distributed_training_utils import setup_device
from super_gradients.training.dataloaders.dataloaders import coco_detection_yolo_format_train, coco_detection_yolo_format_val
from super_gradients.training.losses import PPYoloELoss
from super_gradients.training.metrics import DetectionMetrics_050, DetectionMetrics_050_095, DetectionMetrics_075
from super_gradients.training.models.detection_models.pp_yolo_e import PPYoloEPostPredictionCallback

# Train configuration files
from configurations import TrainModelConfig, TrainParamConfig

def _train():
    # NOTE: CHECKPOINT_DIR MUST BE AN ABSOLUTE PATH FOR MODEL LOADING!
    # Get the absolute path of the current script or working directory
    project_root = os.path.dirname(os.path.abspath(__file__))
    CHECKPOINT_DIR: str = os.path.join(project_root, TrainModelConfig.checkpoints_dir_name)
    CHECKPOINT_DIR_FOLDER: str = os.path.abspath(CHECKPOINT_DIR)
    os.makedirs(CHECKPOINT_DIR_FOLDER, exist_ok=True)

    classes_file = os.path.join(TrainModelConfig.dataset_folder_location, "classes.txt")
    CLASSES = [line.strip() for line in open(classes_file)]

    # Build trainer object
    trainer = Trainer(experiment_name=TrainModelConfig.experiment_name, ckpt_root_dir=CHECKPOINT_DIR_FOLDER)
    # Specify model
    model = models.get(
        TrainModelConfig.model_version,
        num_classes=len(CLASSES),
        pretrained_weights='coco').to(TrainModelConfig.device)

    # Dataset parameters for the dataloaders
    dataset_params = {
        'data_dir': TrainModelConfig.dataset_folder_location,
        'train_images_dir': 'train/images',
        'train_labels_dir': 'train/labels',
        'val_images_dir': 'val/images',
        'val_labels_dir': 'val/labels',
        'test_images_dir': 'test/images',
        'test_labels_dir': 'test/labels',
        'classes': CLASSES,
        'input_dim': TrainModelConfig.input_dim  # desired input dimension  # TODO: extract from train image sizes? -> they seem to differ in the set
    }

    # Set up dataloaders
    train_data = coco_detection_yolo_format_train(
        dataset_params={
            'data_dir': dataset_params['data_dir'],
            # 'input_dim': dataset_params['input_dim'],
            'images_dir': dataset_params['train_images_dir'],
            'labels_dir': dataset_params['train_labels_dir'],
            'classes': dataset_params['classes']
        },
        dataloader_params={
            'batch_size': TrainModelConfig.batch_size,
            'num_workers': TrainModelConfig.num_workers
        }
    )

    val_data = coco_detection_yolo_format_val(
        dataset_params={
            'data_dir': dataset_params['data_dir'],
            # 'input_dim': dataset_params['input_dim'],
            'images_dir': dataset_params['val_images_dir'],
            'labels_dir': dataset_params['val_labels_dir'],
            'classes': dataset_params['classes']
        },
        dataloader_params={
            'batch_size': TrainModelConfig.batch_size,
            'num_workers': TrainModelConfig.num_workers
        }
    )

    test_data = coco_detection_yolo_format_val(
        dataset_params={
            'data_dir': dataset_params['data_dir'],
            # 'input_dim': dataset_params['input_dim'],
            'images_dir': dataset_params['test_images_dir'],
            'labels_dir': dataset_params['test_labels_dir'],
            'classes': dataset_params['classes']
        },
        dataloader_params={
            'batch_size': TrainModelConfig.batch_size,
            'num_workers': TrainModelConfig.num_workers
        }
    )

    # Remove MIXUP from the data augmentation list
    train_data.dataset.transforms.pop(2)

    # Training parameter setup
    train_params = {
        'run_validation_freq': TrainParamConfig.run_validation_freq,
        'run_test_freq': TrainParamConfig.run_test_freq,
        'silent_mode': TrainParamConfig.silent_mode,
        "average_best_models": TrainParamConfig.average_best_models,
        "warmup_mode": TrainParamConfig.warmup_mode,
        "warmup_initial_lr": TrainParamConfig.warmup_initial_lr,
        "lr_warmup_epochs": TrainParamConfig.lr_warmup_epochs,
        "initial_lr": TrainParamConfig.initial_lr,
        "lr_mode": TrainParamConfig.lr_mode,
        "cosine_final_lr_ratio": TrainParamConfig.cosine_final_lr_ratio,
        "optimizer": TrainParamConfig.optimizer,
        "optimizer_params": TrainParamConfig.optimizer_params,
        "zero_weight_decay_on_bias_and_bn": TrainParamConfig.zero_weight_decay_on_bias_and_bn,
        "ema": TrainParamConfig.ema,
        "ema_params": TrainParamConfig.ema_params,
        "max_epochs": TrainParamConfig.max_epochs,
        "mixed_precision": TrainParamConfig.mixed_precision,
        "loss": PPYoloELoss(
            use_static_assigner=False,
            num_classes=len(dataset_params['classes']),
            reg_max=16
        ),
        "valid_metrics_list": [
            DetectionMetrics_050(
                score_thres=0.4,
                top_k_predictions=300,
                num_cls=len(dataset_params['classes']),
                normalize_targets=True,
                post_prediction_callback=PPYoloEPostPredictionCallback(
                    score_threshold=0.01,
                    nms_top_k=1000,
                    max_predictions=300,
                    nms_threshold=0.7
                ),
                calc_best_score_thresholds=True
            ),
            DetectionMetrics_050_095(
                score_thres=0.4,
                top_k_predictions=300,
                num_cls=len(dataset_params['classes']),
                normalize_targets=True,
                post_prediction_callback=PPYoloEPostPredictionCallback(
                    score_threshold=0.01,
                    nms_top_k=1000,
                    max_predictions=300,
                    nms_threshold=0.7
                ),
                calc_best_score_thresholds=True
            )
            # NOTE: _075 not yet supported
            # DetectionMetrics_075(
            #     score_thres=0.1,
            #     top_k_predictions=300,
            #     num_cls=len(dataset_params['classes']),
            #     normalize_targets=True,
            #     post_prediction_callback=PPYoloEPostPredictionCallback(
            #         score_threshold=0.01,
            #         nms_top_k=1000,
            #         max_predictions=300,
            #         nms_threshold=0.7
            #     )
            # )
        ],
    "metric_to_watch": '[email protected]',
    "sg_logger": "clearml_sg_logger",
    "sg_logger_params":                 # Params that will be passes to __init__ of the logger super_gradients.common.sg_loggers.wandb_sg_logger.ClearMLSGLogger 
    {
        "project_name": "BHTDefectDetection", # ClearML project name
        "save_checkpoints_remote": False,
        "save_tensorboard_remote": True,
        "save_logs_remote": True,
    }
}


# Start the trainer using setup (see configurations.py)
trainer.train(
    model=model,
    training_params=train_params,
    train_loader=train_data,
    valid_loader=val_data,
    test_loaders={'test_set': test_data}
)

def train() -> None:  # main entry point to start the training and set up the GPU device
    # Set up environment trainer
    init_trainer()
    if TrainModelConfig.multi_gpu_version.upper() == 'DDP':
        # Launch DDP on num_gpus GPUs; this should be the go-to for parallel processing!
        setup_device(device=TrainModelConfig.device, multi_gpu=MultiGPUMode.DISTRIBUTED_DATA_PARALLEL, num_gpus=TrainModelConfig.num_gpus)
    elif TrainModelConfig.multi_gpu_version.upper() == 'DP':
        # Launch DP on num_gpus GPUs -> NOTE: not working (yet)
        setup_device(multi_gpu=MultiGPUMode.DATA_PARALLEL, num_gpus=TrainModelConfig.num_gpus)

    # Call _train()
    _train()

if __name__ == '__main__':
    train()
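One aside on the script above: train_data.dataset.transforms.pop(2) removes the mixup augmentation by positional index, which silently breaks if the default transform list changes between SuperGradients versions. A sketch of a more explicit alternative, assuming the mixup transform class is DetectionMixup as in recent SuperGradients releases (worth verifying against the installed version):

from super_gradients.training.transforms.transforms import DetectionMixup

# Remove mixup by type instead of by position in the transform list.
train_data.dataset.transforms = [
    t for t in train_data.dataset.transforms if not isinstance(t, DetectionMixup)
]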

Versions

No response

@msciancalepore98

How are you launching your training process? This can happen if you use nohup or even multiplexers like tmux

@TychoBomer
Author

How are you launching your training process? This can happen if you use nohup or even multiplexers like tmux

This was indeed the problem, thanks!
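For anyone hitting the same error: signal 2 is SIGINT, so the launcher is being interrupted from outside the training code, typically when the terminal, tmux pane, or nohup'd shell that started it goes away. One way to decouple the run from the interactive terminal is to start it in its own session. A minimal sketch, assuming a Linux host and that the script above is saved as train.py (both assumptions, not part of the issue):

import subprocess
import sys

# Launch the training script detached from the current terminal's process group,
# so a SIGINT aimed at the interactive shell is not delivered to the run.
# "train.py" and "train.log" are placeholder names for this sketch.
log_file = open("train.log", "w")
proc = subprocess.Popen(
    [sys.executable, "train.py"],
    start_new_session=True,   # runs setsid() in the child process
    stdout=log_file,
    stderr=subprocess.STDOUT,
)
print(f"Training launched with PID {proc.pid}; logs in train.log")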
