Code crash during multiple training epochs running DDP #2065

Open
TychoBomer opened this issue Nov 11, 2024 · 2 comments

Comments

@TychoBomer

💡 Your Question

Training seems fine at first, but after a few epochs the training stops and I get an error looking like this:

torch.distributed.elastic.multiprocessing.api.SignalException: Process 22636 got signal: 2

The problem is clearly in DDP, but it happens during training, which I find very strange. It also does not always happen: the training has completed once for all 999 iterations. Mostly it crashes somewhere around epoch 30-80.

I have two GPUs and I can see that both are initially working, so the DDP setup must be correct, but then it just stops.
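For reference, signal 2 is SIGINT (the signal Ctrl-C sends), which torch.distributed.elastic surfaces as a SignalException. Below is a minimal diagnostic sketch, not part of the original script, that could be installed early in the launcher process to log where the interrupt arrives before the run stops; the handler name and placement are hypothetical:

import signal
import traceback

def _log_sigint(signum, frame):
    # Diagnostic only: show where the launcher was when SIGINT (signal 2) arrived,
    # then restore the default handler and re-raise so the run still stops.
    print(f"Received signal {signum} (SIGINT); stack at interruption:")
    traceback.print_stack(frame)
    signal.signal(signal.SIGINT, signal.SIG_DFL)
    signal.raise_signal(signal.SIGINT)

signal.signal(signal.SIGINT, _log_sigint)  # install before training starts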

import os
from super_gradients import init_trainer
from super_gradients.training import models, Trainer
from super_gradients.common import MultiGPUMode # needed for parallel processing
from super_gradients.training.utils.distributed_training_utils import setup_device
from super_gradients.training.dataloaders.dataloaders import coco_detection_yolo_format_train, coco_detection_yolo_format_val
from super_gradients.training.losses import PPYoloELoss
from super_gradients.training.metrics import DetectionMetrics_050, DetectionMetrics_050_095, DetectionMetrics_075
from super_gradients.training.models.detection_models.pp_yolo_e import PPYoloEPostPredictionCallback

# Train configuration files
from configurations import TrainModelConfig, TrainParamConfig

def _train():
    # NOTE: CHECKPOINT_DIR MUST BE AN ABSOLUTE PATH FOR MODEL LOADING!
    # Get the absolute path of the current script or working directory
    project_root = os.path.dirname(os.path.abspath(__file__))
    CHECKPOINT_DIR: str = os.path.join(project_root, TrainModelConfig.checkpoints_dir_name)
    CHECKPOINT_DIR_FOLDER: str = os.path.abspath(CHECKPOINT_DIR)
    os.makedirs(CHECKPOINT_DIR_FOLDER, exist_ok=True)

    classes_file = os.path.join(TrainModelConfig.dataset_folder_location, "classes.txt")
    CLASSES = [line.strip() for line in open(classes_file)]

    # Build trainer object
    trainer = Trainer(experiment_name=TrainModelConfig.experiment_name, ckpt_root_dir=CHECKPOINT_DIR_FOLDER)
    # Specify model
    model = models.get(
        TrainModelConfig.model_version,
        num_classes=len(CLASSES),
        pretrained_weights='coco').to(TrainModelConfig.device)

    # Dataset parameters for the dataloaders
    dataset_params = {
        'data_dir': TrainModelConfig.dataset_folder_location,
        'train_images_dir': 'train/images',
        'train_labels_dir': 'train/labels',
        'val_images_dir': 'val/images',
        'val_labels_dir': 'val/labels',
        'test_images_dir': 'test/images',
        'test_labels_dir': 'test/labels',
        'classes': CLASSES,
        'input_dim': TrainModelConfig.input_dim  # desired input dimension  # TODO: extract from train image sizes? -> they seem to differ in the set
    }

    # Set up dataloaders
    train_data = coco_detection_yolo_format_train(
        dataset_params={
            'data_dir': dataset_params['data_dir'],
            # 'input_dim': dataset_params['input_dim'],
            'images_dir': dataset_params['train_images_dir'],
            'labels_dir': dataset_params['train_labels_dir'],
            'classes': dataset_params['classes']
        },
        dataloader_params={
            'batch_size': TrainModelConfig.batch_size,
            'num_workers': TrainModelConfig.num_workers
        }
    )

    val_data = coco_detection_yolo_format_val(
        dataset_params={
            'data_dir': dataset_params['data_dir'],
            # 'input_dim': dataset_params['input_dim'],
            'images_dir': dataset_params['val_images_dir'],
            'labels_dir': dataset_params['val_labels_dir'],
            'classes': dataset_params['classes']
        },
        dataloader_params={
            'batch_size': TrainModelConfig.batch_size,
            'num_workers': TrainModelConfig.num_workers
        }
    )

    test_data = coco_detection_yolo_format_val(
        dataset_params={
            'data_dir': dataset_params['data_dir'],
            # 'input_dim': dataset_params['input_dim'],
            'images_dir': dataset_params['test_images_dir'],
            'labels_dir': dataset_params['test_labels_dir'],
            'classes': dataset_params['classes']
        },
        dataloader_params={
            'batch_size': TrainModelConfig.batch_size,
            'num_workers': TrainModelConfig.num_workers
        }
    )

    # Remove MIXUP from the data augmentation list
    train_data.dataset.transforms.pop(2)

    # Training parameter setup
    train_params = {
        'run_validation_freq': TrainParamConfig.run_validation_freq,
        'run_test_freq': TrainParamConfig.run_test_freq,
        'silent_mode': TrainParamConfig.silent_mode,
        "average_best_models": TrainParamConfig.average_best_models,
        "warmup_mode": TrainParamConfig.warmup_mode,
        "warmup_initial_lr": TrainParamConfig.warmup_initial_lr,
        "lr_warmup_epochs": TrainParamConfig.lr_warmup_epochs,
        "initial_lr": TrainParamConfig.initial_lr,
        "lr_mode": TrainParamConfig.lr_mode,
        "cosine_final_lr_ratio": TrainParamConfig.cosine_final_lr_ratio,
        "optimizer": TrainParamConfig.optimizer,
        "optimizer_params": TrainParamConfig.optimizer_params,
        "zero_weight_decay_on_bias_and_bn": TrainParamConfig.zero_weight_decay_on_bias_and_bn,
        "ema": TrainParamConfig.ema,
        "ema_params": TrainParamConfig.ema_params,
        "max_epochs": TrainParamConfig.max_epochs,
        "mixed_precision": TrainParamConfig.mixed_precision,
        "loss": PPYoloELoss(
            use_static_assigner=False,
            num_classes=len(dataset_params['classes']),
            reg_max=16
        ),
        "valid_metrics_list": [
            DetectionMetrics_050(
                score_thres=0.4,
                top_k_predictions=300,
                num_cls=len(dataset_params['classes']),
                normalize_targets=True,
                post_prediction_callback=PPYoloEPostPredictionCallback(
                    score_threshold=0.01,
                    nms_top_k=1000,
                    max_predictions=300,
                    nms_threshold=0.7
                ),
                calc_best_score_thresholds=True
            ),
            DetectionMetrics_050_095(
                score_thres=0.4,
                top_k_predictions=300,
                num_cls=len(dataset_params['classes']),
                normalize_targets=True,
                post_prediction_callback=PPYoloEPostPredictionCallback(
                    score_threshold=0.01,
                    nms_top_k=1000,
                    max_predictions=300,
                    nms_threshold=0.7
                ),
                calc_best_score_thresholds=True
            )
            # NOTE: _075 not yet supported
            # DetectionMetrics_075(
            #     score_thres=0.1,
            #     top_k_predictions=300,
            #     num_cls=len(dataset_params['classes']),
            #     normalize_targets=True,
            #     post_prediction_callback=PPYoloEPostPredictionCallback(
            #         score_threshold=0.01,
            #         nms_top_k=1000,
            #         max_predictions=300,
            #         nms_threshold=0.7
            #     )
            # )
        ],
    "metric_to_watch": '[email protected]',
    "sg_logger": "clearml_sg_logger",
    "sg_logger_params":                 # Params that will be passes to __init__ of the logger super_gradients.common.sg_loggers.wandb_sg_logger.ClearMLSGLogger 
    {
        "project_name": "BHTDefectDetection", # ClearML project name
        "save_checkpoints_remote": False,
        "save_tensorboard_remote": True,
        "save_logs_remote": True,
    }
}


# Start the trainer using setup (see configurations.py)
trainer.train(
    model=model,
    training_params=train_params,
    train_loader=train_data,
    valid_loader=val_data,
    test_loaders={'test_set': test_data}
)

def train() -> None:  # main entry point to start the training and set up the GPU device
    # Set up environment trainer
    init_trainer()
    if TrainModelConfig.multi_gpu_version.upper() == 'DDP':
        # Launch DDP on num_gpus GPUs; this should be the go-to for parallel processing!
        setup_device(device=TrainModelConfig.device, multi_gpu=MultiGPUMode.DISTRIBUTED_DATA_PARALLEL, num_gpus=TrainModelConfig.num_gpus)
    elif TrainModelConfig.multi_gpu_version.upper() == 'DP':
        # Launch DP on num_gpus GPUs -> NOTE: not working (yet)
        setup_device(multi_gpu=MultiGPUMode.DATA_PARALLEL, num_gpus=TrainModelConfig.num_gpus)

    # Call _train()
    _train()

if __name__ == '__main__':
    train()
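One aside on the script above: train_data.dataset.transforms.pop(2) removes the mixup augmentation by positional index, which silently breaks if the default transform list changes between SuperGradients versions. A sketch of a more explicit alternative, assuming the mixup transform class is DetectionMixup as in recent SuperGradients releases (worth verifying against the installed version):

from super_gradients.training.transforms.transforms import DetectionMixup

# Remove mixup by type instead of by position in the transform list.
train_data.dataset.transforms = [
    t for t in train_data.dataset.transforms if not isinstance(t, DetectionMixup)
]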

Versions

No response

@msciancalepore98

How are you launching your training process? This can happen if you use nohup or even multiplexers like tmux

@TychoBomer
Author

How are you launching your training process? This can happen if you use nohup or even multiplexers like tmux

This was indeed the problem, thanks!
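For anyone hitting the same error: signal 2 is SIGINT, so the launcher is being interrupted from outside the training code, typically when the terminal, tmux pane, or nohup'd shell that started it goes away. One way to decouple the run from the interactive terminal is to start it in its own session. A minimal sketch, assuming a Linux host and that the script above is saved as train.py (both assumptions, not part of the issue):

import subprocess
import sys

# Launch the training script detached from the current terminal's process group,
# so a SIGINT aimed at the interactive shell is not delivered to the run.
# "train.py" and "train.log" are placeholder names for this sketch.
log_file = open("train.log", "w")
proc = subprocess.Popen(
    [sys.executable, "train.py"],
    start_new_session=True,   # runs setsid() in the child process
    stdout=log_file,
    stderr=subprocess.STDOUT,
)
print(f"Training launched with PID {proc.pid}; logs in train.log")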
