From 3b0f7b292533f9d7e2665b1b8dd6138afe610c82 Mon Sep 17 00:00:00 2001 From: nerkulec Date: Thu, 14 Sep 2023 20:29:10 +0200 Subject: [PATCH 1/4] Multiple GPUs using DataParallel --- .../basic/ex07_train_network_multiple_gpus.py | 94 +++++++++++++++++++ mala/common/parameters.py | 1 + mala/network/network.py | 4 +- mala/network/runner.py | 5 +- mala/network/trainer.py | 52 ++++++++-- 5 files changed, 146 insertions(+), 10 deletions(-) create mode 100644 examples/basic/ex07_train_network_multiple_gpus.py diff --git a/examples/basic/ex07_train_network_multiple_gpus.py b/examples/basic/ex07_train_network_multiple_gpus.py new file mode 100644 index 000000000..38d5720d0 --- /dev/null +++ b/examples/basic/ex07_train_network_multiple_gpus.py @@ -0,0 +1,94 @@ +import os + +import mala + +from mala.datahandling.data_repo import data_repo_path +data_path = os.path.join(data_repo_path, "Be2") + +""" +This example shows how a neural network can be trained on material +data using this framework. It uses preprocessed data, that is read in +from *.npy files. +""" + + +#################### +# 1. PARAMETERS +# The first step of each MALA workflow is to define a parameters object and +# select the necessary parameters for the application one wants to look into. +#################### + +parameters = mala.Parameters() +# Specify the data scaling. For regular bispectrum and LDOS data, +# these have proven successful. +parameters.data.input_rescaling_type = "feature-wise-standard" +parameters.data.output_rescaling_type = "normal" +# Specify the used activation function. +parameters.network.layer_activations = ["ReLU"] +# Specify the training parameters. +# These may be determined via hyperparameter tuning. +parameters.running.max_number_epochs = 100 +parameters.running.mini_batch_size = 40 +parameters.running.learning_rate = 0.00001 +parameters.running.trainingtype = "Adam" + +parameters.use_gpu = True +parameters.running.num_gpus = 4 +# These parameters characterize how the LDOS and bispectrum descriptors +# were calculated. They are _technically_ not needed to train a simple +# network. However, it is useful to define them prior to training. Then, +# when using the network later in production, all required parameters are +# already set. +parameters.targets.target_type = "LDOS" +parameters.targets.ldos_gridsize = 11 +parameters.targets.ldos_gridspacing_ev = 2.5 +parameters.targets.ldos_gridoffset_ev = -5 + +parameters.descriptors.descriptor_type = "Bispectrum" +parameters.descriptors.bispectrum_twojmax = 10 +parameters.descriptors.bispectrum_cutoff = 4.67637 + +#################### +# 2. DATA +# Data has to be added to the MALA workflow. The central object for this +# is the DataHandler class, which takes care of all data needs. After data +# has been added, it is loaded and scaled with the prepare_data function. +#################### + +data_handler = mala.DataHandler(parameters) +# Add a snapshot we want to use in to the list. +data_handler.add_snapshot("Be_snapshot0.in.npy", data_path, + "Be_snapshot0.out.npy", data_path, "tr") +data_handler.add_snapshot("Be_snapshot1.in.npy", data_path, + "Be_snapshot1.out.npy", data_path, "va") +data_handler.prepare_data() + +#################### +# 3. NETWORK SETUP +# Now we can set up the NN to be used in the ML-DFT model. The layer_sizes +# list determines the number of neurons in the NN. 
It can be specified before +# loading data, but it is recommended to do that afterwards, since then +# the input_dimension and output_dimension properties of the data handling +# class can be used to correctly define input and output layer of the NN. +#################### + +parameters.network.layer_sizes = [data_handler.input_dimension, + 100, + data_handler.output_dimension] +test_network = mala.Network(parameters) + +#################### +# 4. TRAINING THE NETWORK +# Finally, the network can be trained. Afterwards, it can easily be saved +# into a .zip archive for inference. It is recommended to load a file +# containing additional calculation data (e.g., from the QE calculations +# with which the LDOS data was created) so that things like simulated +# temperature, information about the pseudopotential, etc. are stored along- +# side the model. This makes inference easier. +#################### + +test_trainer = mala.Trainer(parameters, test_network, data_handler) +test_trainer.train_network() +additional_calculation_data = os.path.join(data_path, "Be_snapshot0.out") +test_trainer.save_run("be_model", + additional_calculation_data=additional_calculation_data) diff --git a/mala/common/parameters.py b/mala/common/parameters.py index d63149193..0b08a82e3 100644 --- a/mala/common/parameters.py +++ b/mala/common/parameters.py @@ -712,6 +712,7 @@ def __init__(self): self.max_number_epochs = 100 self.verbosity = True self.mini_batch_size = 10 + self.num_gpus = 1 self.weight_decay = 0 self.early_stopping_epochs = 0 self.early_stopping_threshold = 0 diff --git a/mala/network/network.py b/mala/network/network.py index 521b7c35f..87883d041 100644 --- a/mala/network/network.py +++ b/mala/network/network.py @@ -28,7 +28,7 @@ class Network(nn.Module): Parameters used to create this neural network. """ - def __new__(cls, params: Parameters): + def __new__(cls, params: Parameters=None): """ Create a neural network instance. @@ -38,7 +38,7 @@ def __new__(cls, params: Parameters): Parameters ---------- - params : mala.common.parametes.Parameters + params : mala.common.parameters.Parameters Parameters used to create this neural network. 
""" model = None diff --git a/mala/network/runner.py b/mala/network/runner.py index 5367c2a7c..91ff02ca3 100644 --- a/mala/network/runner.py +++ b/mala/network/runner.py @@ -81,7 +81,10 @@ def save_run(self, run_name, save_path="./", zip_run=True, optimizer_file = run_name+".optimizer.pth" self.parameters_full.save(os.path.join(save_path, params_file)) - self.network.save_network(os.path.join(save_path, model_file)) + if hasattr(self.network, "save_network"): + self.network.save_network(os.path.join(save_path, model_file)) + else: + self.network.module.save_network(os.path.join(save_path, model_file)) self.data.input_data_scaler.save(os.path.join(save_path, iscaler_file)) self.data.output_data_scaler.save(os.path.join(save_path, oscaler_file)) diff --git a/mala/network/trainer.py b/mala/network/trainer.py index 98dc291b8..dcd6b6147 100644 --- a/mala/network/trainer.py +++ b/mala/network/trainer.py @@ -630,6 +630,15 @@ def __prepare_to_train(self, optimizer_dict): mini_batch_size * 1, sampler=self.test_sampler, **kwargs)) + + + + if self.parameters_full.use_gpu and self.parameters_full.running.num_gpus > 1: + if self.parameters_full.network.nn_type != "feed-forward": + raise Exception("Only feed-forward networks are supported " + "with multiple GPUs.") + self.network = torch.nn.DataParallel(self.network, + device_ids=list(range(self.parameters_full.running.num_gpus))) def __process_mini_batch(self, network, input_data, target_data): """Process a mini batch.""" @@ -645,7 +654,10 @@ def __process_mini_batch(self, network, input_data, target_data): with torch.cuda.amp.autocast(enabled=self.parameters.use_mixed_precision): prediction = network(input_data) - loss = network.calculate_loss(prediction, target_data) + if hasattr(network, "calculate_loss"): + loss = network.calculate_loss(prediction, target_data) + else: + loss = network.module.calculate_loss(prediction, target_data) if self.gradscaler: self.gradscaler.scale(loss).backward() @@ -666,6 +678,11 @@ def __process_mini_batch(self, network, input_data, target_data): self.static_loss = network.calculate_loss(self.static_prediction, self.static_target_data) + if hasattr(network, "calculate_loss"): + self.static_loss = network.calculate_loss(self.static_prediction, self.static_target_data) + else: + self.static_loss = network.module.calculate_loss(self.static_prediction, self.static_target_data) + if self.gradscaler: self.gradscaler.scale(self.static_loss).backward() else: @@ -688,7 +705,10 @@ def __process_mini_batch(self, network, input_data, target_data): torch.cuda.nvtx.range_pop() torch.cuda.nvtx.range_push("loss") - loss = network.calculate_loss(prediction, target_data) + if hasattr(network, "calculate_loss"): + loss = network.calculate_loss(prediction, target_data) + else: + loss = network.module.calculate_loss(prediction, target_data) # loss torch.cuda.nvtx.range_pop() @@ -711,7 +731,10 @@ def __process_mini_batch(self, network, input_data, target_data): return loss else: prediction = network(input_data) - loss = network.calculate_loss(prediction, target_data) + if hasattr(network, "calculate_loss"): + loss = network.calculate_loss(prediction, target_data) + else: + loss = network.module.calculate_loss(prediction, target_data) loss.backward() self.optimizer.step() self.optimizer.zero_grad() @@ -761,7 +784,10 @@ def __validate_network(self, network, data_set_type, validation_type): for _ in range(20): with torch.cuda.amp.autocast(enabled=self.parameters.use_mixed_precision): prediction = network(x) - loss = 
network.calculate_loss(prediction, y) + if hasattr(network, "calculate_loss"): + loss = network.calculate_loss(prediction, y) + else: + loss = network.module.calculate_loss(prediction, y) torch.cuda.current_stream().wait_stream(s) # Create static entry point tensors to graph @@ -774,6 +800,10 @@ def __validate_network(self, network, data_set_type, validation_type): with torch.cuda.amp.autocast(enabled=self.parameters.use_mixed_precision): self.static_prediction_validation = network(self.static_input_validation) self.static_loss_validation = network.calculate_loss(self.static_prediction_validation, self.static_target_validation) + if hasattr(network, "calculate_loss"): + self.static_loss_validation = network.calculate_loss(self.static_prediction_validation, self.static_target_validation) + else: + self.static_loss_validation = network.module.calculate_loss(self.static_prediction_validation, self.static_target_validation) if self.validation_graph: self.static_input_validation.copy_(x) @@ -783,7 +813,10 @@ def __validate_network(self, network, data_set_type, validation_type): else: with torch.cuda.amp.autocast(enabled=self.parameters.use_mixed_precision): prediction = network(x) - loss = network.calculate_loss(prediction, y) + if hasattr(network, "calculate_loss"): + loss = network.calculate_loss(prediction, y) + else: + loss = network.module.calculate_loss(prediction, y) validation_loss_sum += loss if batchid != 0 and (batchid + 1) % report_freq == 0: torch.cuda.synchronize() @@ -804,8 +837,13 @@ def __validate_network(self, network, data_set_type, validation_type): x = x.to(self.parameters._configuration["device"]) y = y.to(self.parameters._configuration["device"]) prediction = network(x) - validation_loss_sum += \ - network.calculate_loss(prediction, y).item() + + if hasattr(network, "calculate_loss"): + loss = network.calculate_loss(prediction, y) + else: + loss = network.module.calculate_loss(prediction, y) + + validation_loss_sum += loss.item() batchid += 1 validation_loss = validation_loss_sum.item() / batchid From 65d11b113b72bcfa1b4312118be27430dc4b881d Mon Sep 17 00:00:00 2001 From: Lenz Fiedler Date: Thu, 25 Apr 2024 09:21:27 +0200 Subject: [PATCH 2/4] Deleted example and combined parameters --- .../basic/ex07_train_network_multiple_gpus.py | 94 ------------------ mala/common/parameters.py | 19 ++-- mala/descriptors/descriptor.py | 2 +- mala/network/network.py | 4 +- mala/network/runner.py | 6 +- mala/network/trainer.py | 96 +++++++++++++------ 6 files changed, 84 insertions(+), 137 deletions(-) delete mode 100644 examples/basic/ex07_train_network_multiple_gpus.py diff --git a/examples/basic/ex07_train_network_multiple_gpus.py b/examples/basic/ex07_train_network_multiple_gpus.py deleted file mode 100644 index 38d5720d0..000000000 --- a/examples/basic/ex07_train_network_multiple_gpus.py +++ /dev/null @@ -1,94 +0,0 @@ -import os - -import mala - -from mala.datahandling.data_repo import data_repo_path -data_path = os.path.join(data_repo_path, "Be2") - -""" -This example shows how a neural network can be trained on material -data using this framework. It uses preprocessed data, that is read in -from *.npy files. -""" - - -#################### -# 1. PARAMETERS -# The first step of each MALA workflow is to define a parameters object and -# select the necessary parameters for the application one wants to look into. -#################### - -parameters = mala.Parameters() -# Specify the data scaling. For regular bispectrum and LDOS data, -# these have proven successful. 
-parameters.data.input_rescaling_type = "feature-wise-standard" -parameters.data.output_rescaling_type = "normal" -# Specify the used activation function. -parameters.network.layer_activations = ["ReLU"] -# Specify the training parameters. -# These may be determined via hyperparameter tuning. -parameters.running.max_number_epochs = 100 -parameters.running.mini_batch_size = 40 -parameters.running.learning_rate = 0.00001 -parameters.running.trainingtype = "Adam" - -parameters.use_gpu = True -parameters.running.num_gpus = 4 -# These parameters characterize how the LDOS and bispectrum descriptors -# were calculated. They are _technically_ not needed to train a simple -# network. However, it is useful to define them prior to training. Then, -# when using the network later in production, all required parameters are -# already set. -parameters.targets.target_type = "LDOS" -parameters.targets.ldos_gridsize = 11 -parameters.targets.ldos_gridspacing_ev = 2.5 -parameters.targets.ldos_gridoffset_ev = -5 - -parameters.descriptors.descriptor_type = "Bispectrum" -parameters.descriptors.bispectrum_twojmax = 10 -parameters.descriptors.bispectrum_cutoff = 4.67637 - -#################### -# 2. DATA -# Data has to be added to the MALA workflow. The central object for this -# is the DataHandler class, which takes care of all data needs. After data -# has been added, it is loaded and scaled with the prepare_data function. -#################### - -data_handler = mala.DataHandler(parameters) -# Add a snapshot we want to use in to the list. -data_handler.add_snapshot("Be_snapshot0.in.npy", data_path, - "Be_snapshot0.out.npy", data_path, "tr") -data_handler.add_snapshot("Be_snapshot1.in.npy", data_path, - "Be_snapshot1.out.npy", data_path, "va") -data_handler.prepare_data() - -#################### -# 3. NETWORK SETUP -# Now we can set up the NN to be used in the ML-DFT model. The layer_sizes -# list determines the number of neurons in the NN. It can be specified before -# loading data, but it is recommended to do that afterwards, since then -# the input_dimension and output_dimension properties of the data handling -# class can be used to correctly define input and output layer of the NN. -#################### - -parameters.network.layer_sizes = [data_handler.input_dimension, - 100, - data_handler.output_dimension] -test_network = mala.Network(parameters) - -#################### -# 4. TRAINING THE NETWORK -# Finally, the network can be trained. Afterwards, it can easily be saved -# into a .zip archive for inference. It is recommended to load a file -# containing additional calculation data (e.g., from the QE calculations -# with which the LDOS data was created) so that things like simulated -# temperature, information about the pseudopotential, etc. are stored along- -# side the model. This makes inference easier. 
-#################### - -test_trainer = mala.Trainer(parameters, test_network, data_handler) -test_trainer.train_network() -additional_calculation_data = os.path.join(data_path, "Be_snapshot0.out") -test_trainer.save_run("be_model", - additional_calculation_data=additional_calculation_data) diff --git a/mala/common/parameters.py b/mala/common/parameters.py index 9f3ce7b2c..1444b8750 100644 --- a/mala/common/parameters.py +++ b/mala/common/parameters.py @@ -39,7 +39,7 @@ def __init__( ): super(ParametersBase, self).__init__() self._configuration = { - "gpu": False, + "gpu": 0, "horovod": False, "mpi": False, "device": "cpu", @@ -744,7 +744,6 @@ def __init__(self): self.max_number_epochs = 100 self.verbosity = True self.mini_batch_size = 10 - self.num_gpus = 1 self.weight_decay = 0 self.early_stopping_epochs = 0 self.early_stopping_threshold = 0 @@ -845,10 +844,7 @@ def use_graphs(self): @use_graphs.setter def use_graphs(self, value): if value is True: - if ( - self._configuration["gpu"] is False - or torch.version.cuda is None - ): + if self._configuration["gpu"] == 0 or torch.version.cuda is None: parallel_warn("No CUDA or GPU found, cannot use CUDA graphs.") value = False else: @@ -1284,11 +1280,14 @@ def use_gpu(self): @use_gpu.setter def use_gpu(self, value): - if value is False: - self._use_gpu = False + if value is False or value == 0: + self._use_gpu = 0 else: if torch.cuda.is_available(): - self._use_gpu = True + if value is True: + self._use_gpu = 1 + else: + self._use_gpu = value else: parallel_warn( "GPU requested, but no GPU found. MALA will " @@ -1536,7 +1535,7 @@ def optuna_singlenode_setup(self, wait_time=0): """ # We first "trick" the parameters object to assume MPI and GPUs # are used. That way we get the right device. - self.use_gpu = True + self.use_gpu = 1 self.use_mpi = True device_temp = self.device sleep(get_rank() * wait_time) diff --git a/mala/descriptors/descriptor.py b/mala/descriptors/descriptor.py index 0c055a4e0..b292c0ab7 100644 --- a/mala/descriptors/descriptor.py +++ b/mala/descriptors/descriptor.py @@ -757,7 +757,7 @@ def _setup_lammps( lammps_dict["ngridy"] = ny lammps_dict["ngridz"] = nz lammps_dict["switch"] = self.parameters.bispectrum_switchflag - if self.parameters._configuration["gpu"]: + if self.parameters._configuration["gpu"] > 0: # Tell Kokkos to use one GPU. lmp_cmdargs.append("-k") lmp_cmdargs.append("on") diff --git a/mala/network/network.py b/mala/network/network.py index b433a1aca..847f47549 100644 --- a/mala/network/network.py +++ b/mala/network/network.py @@ -30,7 +30,7 @@ class Network(nn.Module): Parameters used to create this neural network. """ - def __new__(cls, params: Parameters=None): + def __new__(cls, params: Parameters = None): """ Create a neural network instance. 
@@ -454,7 +454,7 @@ def __init__(self, params): self.params.layer_activations[0] ]() - if params.use_gpu: + if params.use_gpu > 0: self.to("cuda") def forward(self, x): diff --git a/mala/network/runner.py b/mala/network/runner.py index 33c1f8558..f7e0be697 100644 --- a/mala/network/runner.py +++ b/mala/network/runner.py @@ -91,7 +91,9 @@ def save_run( if hasattr(self.network, "save_network"): self.network.save_network(os.path.join(save_path, model_file)) else: - self.network.module.save_network(os.path.join(save_path, model_file)) + self.network.module.save_network( + os.path.join(save_path, model_file) + ) self.data.input_data_scaler.save(os.path.join(save_path, iscaler_file)) self.data.output_data_scaler.save( os.path.join(save_path, oscaler_file) @@ -428,7 +430,7 @@ def __prepare_to_run(self): """ # See if we want to use horovod. if self.parameters_full.use_horovod: - if self.parameters_full.use_gpu: + if self.parameters_full.use_gpu > 0: # We cannot use "printout" here because this is supposed # to happen on every rank. if self.parameters_full.verbosity >= 2: diff --git a/mala/network/trainer.py b/mala/network/trainer.py index c3123768b..01632a380 100644 --- a/mala/network/trainer.py +++ b/mala/network/trainer.py @@ -308,7 +308,7 @@ def train_network(self): if isinstance(self.data.training_data_sets[0], FastTensorDataset): self.data.training_data_sets[0].shuffle() - if self.parameters._configuration["gpu"]: + if self.parameters._configuration["gpu"] > 0: torch.cuda.synchronize( self.parameters._configuration["device"] ) @@ -445,7 +445,7 @@ def train_network(self): # to disk self.tensor_board.close() - if self.parameters._configuration["gpu"]: + if self.parameters._configuration["gpu"] > 0: torch.cuda.synchronize( self.parameters._configuration["device"] ) @@ -454,7 +454,7 @@ def train_network(self): # in the lazy loading case). if self.parameters.use_shuffling_for_samplers: self.data.mix_datasets() - if self.parameters._configuration["gpu"]: + if self.parameters._configuration["gpu"] > 0: torch.cuda.synchronize( self.parameters._configuration["device"] ) @@ -559,7 +559,7 @@ def __prepare_to_train(self, optimizer_dict): "num_workers": self.parameters.num_workers, "pin_memory": False, } - if self.parameters_full.use_gpu: + if self.parameters_full.use_gpu > 0: kwargs["pin_memory"] = True # Read last epoch @@ -776,17 +776,20 @@ def __prepare_to_train(self, optimizer_dict): ) ) - - if self.parameters_full.use_gpu and self.parameters_full.running.num_gpus > 1: + if self.parameters_full.use_gpu > 1: if self.parameters_full.network.nn_type != "feed-forward": - raise Exception("Only feed-forward networks are supported " - "with multiple GPUs.") - self.network = torch.nn.DataParallel(self.network, - device_ids=list(range(self.parameters_full.running.num_gpus))) + raise Exception( + "Only feed-forward networks are supported " + "with multiple GPUs." 
+ ) + self.network = torch.nn.DataParallel( + self.network, + device_ids=list(range(self.parameters_full.use_gpu)), + ) def __process_mini_batch(self, network, input_data, target_data): """Process a mini batch.""" - if self.parameters._configuration["gpu"]: + if self.parameters._configuration["gpu"] > 0: if self.parameters.use_graphs and self.train_graph is None: printout("Capturing CUDA graph for training.", min_verbosity=2) s = torch.cuda.Stream(self.parameters._configuration["device"]) @@ -808,9 +811,13 @@ def __process_mini_batch(self, network, input_data, target_data): prediction, target_data ) if hasattr(network, "calculate_loss"): - loss = network.calculate_loss(prediction, target_data) + loss = network.calculate_loss( + prediction, target_data + ) else: - loss = network.module.calculate_loss(prediction, target_data) + loss = network.module.calculate_loss( + prediction, target_data + ) if self.gradscaler: self.gradscaler.scale(loss).backward() @@ -840,9 +847,13 @@ def __process_mini_batch(self, network, input_data, target_data): ) if hasattr(network, "calculate_loss"): - self.static_loss = network.calculate_loss(self.static_prediction, self.static_target_data) + self.static_loss = network.calculate_loss( + self.static_prediction, self.static_target_data + ) else: - self.static_loss = network.module.calculate_loss(self.static_prediction, self.static_target_data) + self.static_loss = network.module.calculate_loss( + self.static_prediction, self.static_target_data + ) if self.gradscaler: self.gradscaler.scale(self.static_loss).backward() @@ -871,7 +882,9 @@ def __process_mini_batch(self, network, input_data, target_data): if hasattr(network, "calculate_loss"): loss = network.calculate_loss(prediction, target_data) else: - loss = network.module.calculate_loss(prediction, target_data) + loss = network.module.calculate_loss( + prediction, target_data + ) # loss torch.cuda.nvtx.range_pop() @@ -930,7 +943,7 @@ def __validate_network(self, network, data_set_type, validation_type): 1, device=self.parameters._configuration["device"] ) with torch.no_grad(): - if self.parameters._configuration["gpu"]: + if self.parameters._configuration["gpu"] > 0: report_freq = self.parameters.training_report_frequency torch.cuda.synchronize( self.parameters._configuration["device"] @@ -973,10 +986,16 @@ def __validate_network(self, network, data_set_type, validation_type): enabled=self.parameters.use_mixed_precision ): prediction = network(x) - if hasattr(network, "calculate_loss"): - loss = network.calculate_loss(prediction, y) + if hasattr( + network, "calculate_loss" + ): + loss = network.calculate_loss( + prediction, y + ) else: - loss = network.module.calculate_loss(prediction, y) + loss = network.module.calculate_loss( + prediction, y + ) torch.cuda.current_stream( self.parameters._configuration["device"] ).wait_stream(s) @@ -992,13 +1011,28 @@ def __validate_network(self, network, data_set_type, validation_type): # Capture graph self.validation_graph = torch.cuda.CUDAGraph() with torch.cuda.graph(self.validation_graph): - with torch.cuda.amp.autocast(enabled=self.parameters.use_mixed_precision): - self.static_prediction_validation = network(self.static_input_validation) - self.static_loss_validation = network.calculate_loss(self.static_prediction_validation, self.static_target_validation) + with torch.cuda.amp.autocast( + enabled=self.parameters.use_mixed_precision + ): + self.static_prediction_validation = ( + network( + self.static_input_validation + ) + ) + self.static_loss_validation = 
network.calculate_loss( + self.static_prediction_validation, + self.static_target_validation, + ) if hasattr(network, "calculate_loss"): - self.static_loss_validation = network.calculate_loss(self.static_prediction_validation, self.static_target_validation) + self.static_loss_validation = network.calculate_loss( + self.static_prediction_validation, + self.static_target_validation, + ) else: - self.static_loss_validation = network.module.calculate_loss(self.static_prediction_validation, self.static_target_validation) + self.static_loss_validation = network.module.calculate_loss( + self.static_prediction_validation, + self.static_target_validation, + ) with torch.cuda.amp.autocast( enabled=self.parameters.use_mixed_precision ): @@ -1025,9 +1059,13 @@ def __validate_network(self, network, data_set_type, validation_type): ): prediction = network(x) if hasattr(network, "calculate_loss"): - loss = network.calculate_loss(prediction, y) + loss = network.calculate_loss( + prediction, y + ) else: - loss = network.module.calculate_loss(prediction, y) + loss = network.module.calculate_loss( + prediction, y + ) validation_loss_sum += loss if ( batchid != 0 @@ -1063,7 +1101,9 @@ def __validate_network(self, network, data_set_type, validation_type): if hasattr(network, "calculate_loss"): loss = network.calculate_loss(prediction, y) else: - loss = network.module.calculate_loss(prediction, y) + loss = network.module.calculate_loss( + prediction, y + ) validation_loss_sum += loss.item() batchid += 1 From e4f2eed36e406cc709fa1886ef66b501055f8a4b Mon Sep 17 00:00:00 2001 From: nerkulec Date: Thu, 25 Apr 2024 11:11:07 +0200 Subject: [PATCH 3/4] Change hasattr check to module --- mala/network/runner.py | 6 +++--- mala/network/trainer.py | 48 ++++++++++++++++++++--------------------- 2 files changed, 27 insertions(+), 27 deletions(-) diff --git a/mala/network/runner.py b/mala/network/runner.py index f7e0be697..83d97fc60 100644 --- a/mala/network/runner.py +++ b/mala/network/runner.py @@ -88,12 +88,12 @@ def save_run( optimizer_file = run_name + ".optimizer.pth" self.parameters_full.save(os.path.join(save_path, params_file)) - if hasattr(self.network, "save_network"): - self.network.save_network(os.path.join(save_path, model_file)) - else: + if hasattr(self.network, "module"): self.network.module.save_network( os.path.join(save_path, model_file) ) + else: + self.network.save_network(os.path.join(save_path, model_file)) self.data.input_data_scaler.save(os.path.join(save_path, iscaler_file)) self.data.output_data_scaler.save( os.path.join(save_path, oscaler_file) diff --git a/mala/network/trainer.py b/mala/network/trainer.py index 01632a380..3221041f6 100644 --- a/mala/network/trainer.py +++ b/mala/network/trainer.py @@ -810,12 +810,12 @@ def __process_mini_batch(self, network, input_data, target_data): loss = network.calculate_loss( prediction, target_data ) - if hasattr(network, "calculate_loss"): - loss = network.calculate_loss( + if hasattr(network, "module"): + loss = network.module.calculate_loss( prediction, target_data ) else: - loss = network.module.calculate_loss( + loss = network.calculate_loss( prediction, target_data ) @@ -846,12 +846,12 @@ def __process_mini_batch(self, network, input_data, target_data): self.static_prediction, self.static_target_data ) - if hasattr(network, "calculate_loss"): - self.static_loss = network.calculate_loss( + if hasattr(network, "module"): + self.static_loss = network.module.calculate_loss( self.static_prediction, self.static_target_data ) else: - self.static_loss = 
network.module.calculate_loss( + self.static_loss = network.calculate_loss( self.static_prediction, self.static_target_data ) @@ -879,12 +879,12 @@ def __process_mini_batch(self, network, input_data, target_data): torch.cuda.nvtx.range_pop() torch.cuda.nvtx.range_push("loss") - if hasattr(network, "calculate_loss"): - loss = network.calculate_loss(prediction, target_data) - else: + if hasattr(network, "module"): loss = network.module.calculate_loss( prediction, target_data ) + else: + loss = network.calculate_loss(prediction, target_data) # loss torch.cuda.nvtx.range_pop() @@ -907,10 +907,10 @@ def __process_mini_batch(self, network, input_data, target_data): return loss else: prediction = network(input_data) - if hasattr(network, "calculate_loss"): - loss = network.calculate_loss(prediction, target_data) - else: + if hasattr(network, "module"): loss = network.module.calculate_loss(prediction, target_data) + else: + loss = network.calculate_loss(prediction, target_data) loss.backward() self.optimizer.step() self.optimizer.zero_grad() @@ -987,13 +987,13 @@ def __validate_network(self, network, data_set_type, validation_type): ): prediction = network(x) if hasattr( - network, "calculate_loss" + network, "module" ): - loss = network.calculate_loss( + loss = network.module.calculate_loss( prediction, y ) else: - loss = network.module.calculate_loss( + loss = network.calculate_loss( prediction, y ) torch.cuda.current_stream( @@ -1023,13 +1023,13 @@ def __validate_network(self, network, data_set_type, validation_type): self.static_prediction_validation, self.static_target_validation, ) - if hasattr(network, "calculate_loss"): - self.static_loss_validation = network.calculate_loss( + if hasattr(network, "module"): + self.static_loss_validation = network.module.calculate_loss( self.static_prediction_validation, self.static_target_validation, ) else: - self.static_loss_validation = network.module.calculate_loss( + self.static_loss_validation = network.calculate_loss( self.static_prediction_validation, self.static_target_validation, ) @@ -1058,12 +1058,12 @@ def __validate_network(self, network, data_set_type, validation_type): enabled=self.parameters.use_mixed_precision ): prediction = network(x) - if hasattr(network, "calculate_loss"): - loss = network.calculate_loss( + if hasattr(network, "module"): + loss = network.module.calculate_loss( prediction, y ) else: - loss = network.module.calculate_loss( + loss = network.calculate_loss( prediction, y ) validation_loss_sum += loss @@ -1098,12 +1098,12 @@ def __validate_network(self, network, data_set_type, validation_type): y = y.to(self.parameters._configuration["device"]) prediction = network(x) - if hasattr(network, "calculate_loss"): - loss = network.calculate_loss(prediction, y) - else: + if hasattr(network, "module"): loss = network.module.calculate_loss( prediction, y ) + else: + loss = network.calculate_loss(prediction, y) validation_loss_sum += loss.item() batchid += 1 From 0d43cba047cc73585b7b5e7ba7032532ee4cb587 Mon Sep 17 00:00:00 2001 From: Lenz Fiedler Date: Thu, 25 Apr 2024 14:56:42 +0200 Subject: [PATCH 4/4] Added documentation on DP usage --- docs/source/advanced_usage/trainingmodel.rst | 4 +++- mala/common/parameters.py | 13 ++++++++++++- 2 files changed, 15 insertions(+), 2 deletions(-) diff --git a/docs/source/advanced_usage/trainingmodel.rst b/docs/source/advanced_usage/trainingmodel.rst index ddb429368..8a353289f 100644 --- a/docs/source/advanced_usage/trainingmodel.rst +++ b/docs/source/advanced_usage/trainingmodel.rst @@ -22,7 
+22,9 @@ GPU usage via

     parameters.use_gpu = True

 Afterwards, the entire training will be performed on the GPU - given that
-a GPU is available.
+a GPU is available. You can also set ``parameters.use_gpu`` to a specific
+number, e.g., ``parameters.use_gpu = 4``, to train on multiple GPUs within
+one compute node (here, four), provided that many GPUs are available.
 In cooperation with `Nvidia `_, advanced GPU
 performance optimizations have been implemented into MALA.
diff --git a/mala/common/parameters.py b/mala/common/parameters.py
index 1444b8750..070a10100 100644
--- a/mala/common/parameters.py
+++ b/mala/common/parameters.py
@@ -1275,7 +1275,18 @@ def verbosity(self, value):
 
     @property
     def use_gpu(self):
-        """Control whether or not a GPU is used (provided there is one)."""
+        """
+        Control whether a GPU is used (provided there is one).
+
+        Can either be False/True or an integer. If set to False or 0,
+        no GPU will be used. For numbers higher than 0, the behavior differs
+        between training and inference. For training, the number of GPUs
+        set here will be used, i.e., if use_gpu=4, MALA will attempt to use
+        4 GPUs for training. For inference, only one GPU will be used per
+        MPI rank, such that the total number of GPUs used is determined by
+        the number of MPI ranks. Every value higher than 0 is therefore
+        treated the same in inference.
+        """
         return self._use_gpu
 
     @use_gpu.setter
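
Taken together, this series turns ``parameters.use_gpu`` into the single
switch for multi-GPU training: ``True`` is normalized to ``1``, and any value
greater than ``1`` makes the trainer wrap the network in
``torch.nn.DataParallel``. A minimal sketch of the resulting user-facing
setup (the value 4 is illustrative, not a recommendation):

    import mala

    parameters = mala.Parameters()
    # Accepts False/True or an integer GPU count; True is normalized to 1,
    # and 0/False disables GPU use entirely.
    parameters.use_gpu = 4
    # With use_gpu > 1, Trainer.__prepare_to_train wraps the network in
    # torch.nn.DataParallel over device_ids 0..3. Only feed-forward
    # networks are supported; other nn_types raise an Exception.

The rest of the training workflow is unchanged, which is why patch 2 could
drop the dedicated ex07 example again.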
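
The recurring ``hasattr(network, "module")`` check (introduced as
``hasattr(network, "calculate_loss")`` in patch 1 and inverted in patch 3)
is needed because ``torch.nn.DataParallel`` only dispatches ``forward()``;
custom methods such as ``calculate_loss`` and ``save_network`` live on the
wrapped model and must be reached via its ``module`` attribute. A
self-contained sketch of the pattern follows; ``TinyNet`` is an illustrative
stand-in, not a MALA class:

    import torch
    import torch.nn as nn

    class TinyNet(nn.Module):
        """Stand-in for mala.Network: forward() plus a custom loss method."""

        def __init__(self):
            super().__init__()
            self.layer = nn.Linear(4, 2)

        def forward(self, x):
            return self.layer(x)

        def calculate_loss(self, prediction, target):
            return nn.functional.mse_loss(prediction, target)

    device = "cuda" if torch.cuda.is_available() else "cpu"
    net = TinyNet().to(device)
    if torch.cuda.device_count() > 1:
        # DataParallel replicates the model and scatters each batch
        # across the given devices during forward().
        net = nn.DataParallel(
            net, device_ids=list(range(torch.cuda.device_count()))
        )

    x = torch.randn(8, 4, device=device)
    y = torch.randn(8, 2, device=device)
    prediction = net(x)  # dispatched by DataParallel if wrapped

    # Custom methods are not forwarded by the wrapper, hence the check:
    if hasattr(net, "module"):
        loss = net.module.calculate_loss(prediction, y)
    else:
        loss = net.calculate_loss(prediction, y)

Checking for ``module`` rather than ``calculate_loss`` tests the same
distinction, but states the intent (is this a wrapper?) directly, which is
why patch 3 flips the branches.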