From 3b0f7b292533f9d7e2665b1b8dd6138afe610c82 Mon Sep 17 00:00:00 2001 From: nerkulec Date: Thu, 14 Sep 2023 20:29:10 +0200 Subject: [PATCH 1/4] Multiple GPUs using DataParallel --- .../basic/ex07_train_network_multiple_gpus.py | 94 +++++++++++++++++++ mala/common/parameters.py | 1 + mala/network/network.py | 4 +- mala/network/runner.py | 5 +- mala/network/trainer.py | 52 ++++++++-- 5 files changed, 146 insertions(+), 10 deletions(-) create mode 100644 examples/basic/ex07_train_network_multiple_gpus.py diff --git a/examples/basic/ex07_train_network_multiple_gpus.py b/examples/basic/ex07_train_network_multiple_gpus.py new file mode 100644 index 000000000..38d5720d0 --- /dev/null +++ b/examples/basic/ex07_train_network_multiple_gpus.py @@ -0,0 +1,94 @@ +import os + +import mala + +from mala.datahandling.data_repo import data_repo_path +data_path = os.path.join(data_repo_path, "Be2") + +""" +This example shows how a neural network can be trained on material +data using this framework. It uses preprocessed data, that is read in +from *.npy files. +""" + + +#################### +# 1. PARAMETERS +# The first step of each MALA workflow is to define a parameters object and +# select the necessary parameters for the application one wants to look into. +#################### + +parameters = mala.Parameters() +# Specify the data scaling. For regular bispectrum and LDOS data, +# these have proven successful. +parameters.data.input_rescaling_type = "feature-wise-standard" +parameters.data.output_rescaling_type = "normal" +# Specify the used activation function. +parameters.network.layer_activations = ["ReLU"] +# Specify the training parameters. +# These may be determined via hyperparameter tuning. +parameters.running.max_number_epochs = 100 +parameters.running.mini_batch_size = 40 +parameters.running.learning_rate = 0.00001 +parameters.running.trainingtype = "Adam" + +parameters.use_gpu = True +parameters.running.num_gpus = 4 +# These parameters characterize how the LDOS and bispectrum descriptors +# were calculated. They are _technically_ not needed to train a simple +# network. However, it is useful to define them prior to training. Then, +# when using the network later in production, all required parameters are +# already set. +parameters.targets.target_type = "LDOS" +parameters.targets.ldos_gridsize = 11 +parameters.targets.ldos_gridspacing_ev = 2.5 +parameters.targets.ldos_gridoffset_ev = -5 + +parameters.descriptors.descriptor_type = "Bispectrum" +parameters.descriptors.bispectrum_twojmax = 10 +parameters.descriptors.bispectrum_cutoff = 4.67637 + +#################### +# 2. DATA +# Data has to be added to the MALA workflow. The central object for this +# is the DataHandler class, which takes care of all data needs. After data +# has been added, it is loaded and scaled with the prepare_data function. +#################### + +data_handler = mala.DataHandler(parameters) +# Add a snapshot we want to use in to the list. +data_handler.add_snapshot("Be_snapshot0.in.npy", data_path, + "Be_snapshot0.out.npy", data_path, "tr") +data_handler.add_snapshot("Be_snapshot1.in.npy", data_path, + "Be_snapshot1.out.npy", data_path, "va") +data_handler.prepare_data() + +#################### +# 3. NETWORK SETUP +# Now we can set up the NN to be used in the ML-DFT model. The layer_sizes +# list determines the number of neurons in the NN. 
It can be specified before +# loading data, but it is recommended to do that afterwards, since then +# the input_dimension and output_dimension properties of the data handling +# class can be used to correctly define input and output layer of the NN. +#################### + +parameters.network.layer_sizes = [data_handler.input_dimension, + 100, + data_handler.output_dimension] +test_network = mala.Network(parameters) + +#################### +# 4. TRAINING THE NETWORK +# Finally, the network can be trained. Afterwards, it can easily be saved +# into a .zip archive for inference. It is recommended to load a file +# containing additional calculation data (e.g., from the QE calculations +# with which the LDOS data was created) so that things like simulated +# temperature, information about the pseudopotential, etc. are stored along- +# side the model. This makes inference easier. +#################### + +test_trainer = mala.Trainer(parameters, test_network, data_handler) +test_trainer.train_network() +additional_calculation_data = os.path.join(data_path, "Be_snapshot0.out") +test_trainer.save_run("be_model", + additional_calculation_data=additional_calculation_data) diff --git a/mala/common/parameters.py b/mala/common/parameters.py index d63149193..0b08a82e3 100644 --- a/mala/common/parameters.py +++ b/mala/common/parameters.py @@ -712,6 +712,7 @@ def __init__(self): self.max_number_epochs = 100 self.verbosity = True self.mini_batch_size = 10 + self.num_gpus = 1 self.weight_decay = 0 self.early_stopping_epochs = 0 self.early_stopping_threshold = 0 diff --git a/mala/network/network.py b/mala/network/network.py index 521b7c35f..87883d041 100644 --- a/mala/network/network.py +++ b/mala/network/network.py @@ -28,7 +28,7 @@ class Network(nn.Module): Parameters used to create this neural network. """ - def __new__(cls, params: Parameters): + def __new__(cls, params: Parameters=None): """ Create a neural network instance. @@ -38,7 +38,7 @@ def __new__(cls, params: Parameters): Parameters ---------- - params : mala.common.parametes.Parameters + params : mala.common.parameters.Parameters Parameters used to create this neural network. 
""" model = None diff --git a/mala/network/runner.py b/mala/network/runner.py index 5367c2a7c..91ff02ca3 100644 --- a/mala/network/runner.py +++ b/mala/network/runner.py @@ -81,7 +81,10 @@ def save_run(self, run_name, save_path="./", zip_run=True, optimizer_file = run_name+".optimizer.pth" self.parameters_full.save(os.path.join(save_path, params_file)) - self.network.save_network(os.path.join(save_path, model_file)) + if hasattr(self.network, "save_network"): + self.network.save_network(os.path.join(save_path, model_file)) + else: + self.network.module.save_network(os.path.join(save_path, model_file)) self.data.input_data_scaler.save(os.path.join(save_path, iscaler_file)) self.data.output_data_scaler.save(os.path.join(save_path, oscaler_file)) diff --git a/mala/network/trainer.py b/mala/network/trainer.py index 98dc291b8..dcd6b6147 100644 --- a/mala/network/trainer.py +++ b/mala/network/trainer.py @@ -630,6 +630,15 @@ def __prepare_to_train(self, optimizer_dict): mini_batch_size * 1, sampler=self.test_sampler, **kwargs)) + + + + if self.parameters_full.use_gpu and self.parameters_full.running.num_gpus > 1: + if self.parameters_full.network.nn_type != "feed-forward": + raise Exception("Only feed-forward networks are supported " + "with multiple GPUs.") + self.network = torch.nn.DataParallel(self.network, + device_ids=list(range(self.parameters_full.running.num_gpus))) def __process_mini_batch(self, network, input_data, target_data): """Process a mini batch.""" @@ -645,7 +654,10 @@ def __process_mini_batch(self, network, input_data, target_data): with torch.cuda.amp.autocast(enabled=self.parameters.use_mixed_precision): prediction = network(input_data) - loss = network.calculate_loss(prediction, target_data) + if hasattr(network, "calculate_loss"): + loss = network.calculate_loss(prediction, target_data) + else: + loss = network.module.calculate_loss(prediction, target_data) if self.gradscaler: self.gradscaler.scale(loss).backward() @@ -666,6 +678,11 @@ def __process_mini_batch(self, network, input_data, target_data): self.static_loss = network.calculate_loss(self.static_prediction, self.static_target_data) + if hasattr(network, "calculate_loss"): + self.static_loss = network.calculate_loss(self.static_prediction, self.static_target_data) + else: + self.static_loss = network.module.calculate_loss(self.static_prediction, self.static_target_data) + if self.gradscaler: self.gradscaler.scale(self.static_loss).backward() else: @@ -688,7 +705,10 @@ def __process_mini_batch(self, network, input_data, target_data): torch.cuda.nvtx.range_pop() torch.cuda.nvtx.range_push("loss") - loss = network.calculate_loss(prediction, target_data) + if hasattr(network, "calculate_loss"): + loss = network.calculate_loss(prediction, target_data) + else: + loss = network.module.calculate_loss(prediction, target_data) # loss torch.cuda.nvtx.range_pop() @@ -711,7 +731,10 @@ def __process_mini_batch(self, network, input_data, target_data): return loss else: prediction = network(input_data) - loss = network.calculate_loss(prediction, target_data) + if hasattr(network, "calculate_loss"): + loss = network.calculate_loss(prediction, target_data) + else: + loss = network.module.calculate_loss(prediction, target_data) loss.backward() self.optimizer.step() self.optimizer.zero_grad() @@ -761,7 +784,10 @@ def __validate_network(self, network, data_set_type, validation_type): for _ in range(20): with torch.cuda.amp.autocast(enabled=self.parameters.use_mixed_precision): prediction = network(x) - loss = 
network.calculate_loss(prediction, y) + if hasattr(network, "calculate_loss"): + loss = network.calculate_loss(prediction, y) + else: + loss = network.module.calculate_loss(prediction, y) torch.cuda.current_stream().wait_stream(s) # Create static entry point tensors to graph @@ -774,6 +800,10 @@ def __validate_network(self, network, data_set_type, validation_type): with torch.cuda.amp.autocast(enabled=self.parameters.use_mixed_precision): self.static_prediction_validation = network(self.static_input_validation) self.static_loss_validation = network.calculate_loss(self.static_prediction_validation, self.static_target_validation) + if hasattr(network, "calculate_loss"): + self.static_loss_validation = network.calculate_loss(self.static_prediction_validation, self.static_target_validation) + else: + self.static_loss_validation = network.module.calculate_loss(self.static_prediction_validation, self.static_target_validation) if self.validation_graph: self.static_input_validation.copy_(x) @@ -783,7 +813,10 @@ def __validate_network(self, network, data_set_type, validation_type): else: with torch.cuda.amp.autocast(enabled=self.parameters.use_mixed_precision): prediction = network(x) - loss = network.calculate_loss(prediction, y) + if hasattr(network, "calculate_loss"): + loss = network.calculate_loss(prediction, y) + else: + loss = network.module.calculate_loss(prediction, y) validation_loss_sum += loss if batchid != 0 and (batchid + 1) % report_freq == 0: torch.cuda.synchronize() @@ -804,8 +837,13 @@ def __validate_network(self, network, data_set_type, validation_type): x = x.to(self.parameters._configuration["device"]) y = y.to(self.parameters._configuration["device"]) prediction = network(x) - validation_loss_sum += \ - network.calculate_loss(prediction, y).item() + + if hasattr(network, "calculate_loss"): + loss = network.calculate_loss(prediction, y) + else: + loss = network.module.calculate_loss(prediction, y) + + validation_loss_sum += loss.item() batchid += 1 validation_loss = validation_loss_sum.item() / batchid From 65d11b113b72bcfa1b4312118be27430dc4b881d Mon Sep 17 00:00:00 2001 From: Lenz Fiedler Date: Thu, 25 Apr 2024 09:21:27 +0200 Subject: [PATCH 2/4] Deleted example and combined parameters --- .../basic/ex07_train_network_multiple_gpus.py | 94 ------------------ mala/common/parameters.py | 19 ++-- mala/descriptors/descriptor.py | 2 +- mala/network/network.py | 4 +- mala/network/runner.py | 6 +- mala/network/trainer.py | 96 +++++++++++++------ 6 files changed, 84 insertions(+), 137 deletions(-) delete mode 100644 examples/basic/ex07_train_network_multiple_gpus.py diff --git a/examples/basic/ex07_train_network_multiple_gpus.py b/examples/basic/ex07_train_network_multiple_gpus.py deleted file mode 100644 index 38d5720d0..000000000 --- a/examples/basic/ex07_train_network_multiple_gpus.py +++ /dev/null @@ -1,94 +0,0 @@ -import os - -import mala - -from mala.datahandling.data_repo import data_repo_path -data_path = os.path.join(data_repo_path, "Be2") - -""" -This example shows how a neural network can be trained on material -data using this framework. It uses preprocessed data, that is read in -from *.npy files. -""" - - -#################### -# 1. PARAMETERS -# The first step of each MALA workflow is to define a parameters object and -# select the necessary parameters for the application one wants to look into. -#################### - -parameters = mala.Parameters() -# Specify the data scaling. For regular bispectrum and LDOS data, -# these have proven successful. 
-parameters.data.input_rescaling_type = "feature-wise-standard" -parameters.data.output_rescaling_type = "normal" -# Specify the used activation function. -parameters.network.layer_activations = ["ReLU"] -# Specify the training parameters. -# These may be determined via hyperparameter tuning. -parameters.running.max_number_epochs = 100 -parameters.running.mini_batch_size = 40 -parameters.running.learning_rate = 0.00001 -parameters.running.trainingtype = "Adam" - -parameters.use_gpu = True -parameters.running.num_gpus = 4 -# These parameters characterize how the LDOS and bispectrum descriptors -# were calculated. They are _technically_ not needed to train a simple -# network. However, it is useful to define them prior to training. Then, -# when using the network later in production, all required parameters are -# already set. -parameters.targets.target_type = "LDOS" -parameters.targets.ldos_gridsize = 11 -parameters.targets.ldos_gridspacing_ev = 2.5 -parameters.targets.ldos_gridoffset_ev = -5 - -parameters.descriptors.descriptor_type = "Bispectrum" -parameters.descriptors.bispectrum_twojmax = 10 -parameters.descriptors.bispectrum_cutoff = 4.67637 - -#################### -# 2. DATA -# Data has to be added to the MALA workflow. The central object for this -# is the DataHandler class, which takes care of all data needs. After data -# has been added, it is loaded and scaled with the prepare_data function. -#################### - -data_handler = mala.DataHandler(parameters) -# Add a snapshot we want to use in to the list. -data_handler.add_snapshot("Be_snapshot0.in.npy", data_path, - "Be_snapshot0.out.npy", data_path, "tr") -data_handler.add_snapshot("Be_snapshot1.in.npy", data_path, - "Be_snapshot1.out.npy", data_path, "va") -data_handler.prepare_data() - -#################### -# 3. NETWORK SETUP -# Now we can set up the NN to be used in the ML-DFT model. The layer_sizes -# list determines the number of neurons in the NN. It can be specified before -# loading data, but it is recommended to do that afterwards, since then -# the input_dimension and output_dimension properties of the data handling -# class can be used to correctly define input and output layer of the NN. -#################### - -parameters.network.layer_sizes = [data_handler.input_dimension, - 100, - data_handler.output_dimension] -test_network = mala.Network(parameters) - -#################### -# 4. TRAINING THE NETWORK -# Finally, the network can be trained. Afterwards, it can easily be saved -# into a .zip archive for inference. It is recommended to load a file -# containing additional calculation data (e.g., from the QE calculations -# with which the LDOS data was created) so that things like simulated -# temperature, information about the pseudopotential, etc. are stored along- -# side the model. This makes inference easier. 
-#################### - -test_trainer = mala.Trainer(parameters, test_network, data_handler) -test_trainer.train_network() -additional_calculation_data = os.path.join(data_path, "Be_snapshot0.out") -test_trainer.save_run("be_model", - additional_calculation_data=additional_calculation_data) diff --git a/mala/common/parameters.py b/mala/common/parameters.py index 9f3ce7b2c..1444b8750 100644 --- a/mala/common/parameters.py +++ b/mala/common/parameters.py @@ -39,7 +39,7 @@ def __init__( ): super(ParametersBase, self).__init__() self._configuration = { - "gpu": False, + "gpu": 0, "horovod": False, "mpi": False, "device": "cpu", @@ -744,7 +744,6 @@ def __init__(self): self.max_number_epochs = 100 self.verbosity = True self.mini_batch_size = 10 - self.num_gpus = 1 self.weight_decay = 0 self.early_stopping_epochs = 0 self.early_stopping_threshold = 0 @@ -845,10 +844,7 @@ def use_graphs(self): @use_graphs.setter def use_graphs(self, value): if value is True: - if ( - self._configuration["gpu"] is False - or torch.version.cuda is None - ): + if self._configuration["gpu"] == 0 or torch.version.cuda is None: parallel_warn("No CUDA or GPU found, cannot use CUDA graphs.") value = False else: @@ -1284,11 +1280,14 @@ def use_gpu(self): @use_gpu.setter def use_gpu(self, value): - if value is False: - self._use_gpu = False + if value is False or value == 0: + self._use_gpu = 0 else: if torch.cuda.is_available(): - self._use_gpu = True + if value is True: + self._use_gpu = 1 + else: + self._use_gpu = value else: parallel_warn( "GPU requested, but no GPU found. MALA will " @@ -1536,7 +1535,7 @@ def optuna_singlenode_setup(self, wait_time=0): """ # We first "trick" the parameters object to assume MPI and GPUs # are used. That way we get the right device. - self.use_gpu = True + self.use_gpu = 1 self.use_mpi = True device_temp = self.device sleep(get_rank() * wait_time) diff --git a/mala/descriptors/descriptor.py b/mala/descriptors/descriptor.py index 0c055a4e0..b292c0ab7 100644 --- a/mala/descriptors/descriptor.py +++ b/mala/descriptors/descriptor.py @@ -757,7 +757,7 @@ def _setup_lammps( lammps_dict["ngridy"] = ny lammps_dict["ngridz"] = nz lammps_dict["switch"] = self.parameters.bispectrum_switchflag - if self.parameters._configuration["gpu"]: + if self.parameters._configuration["gpu"] > 0: # Tell Kokkos to use one GPU. lmp_cmdargs.append("-k") lmp_cmdargs.append("on") diff --git a/mala/network/network.py b/mala/network/network.py index b433a1aca..847f47549 100644 --- a/mala/network/network.py +++ b/mala/network/network.py @@ -30,7 +30,7 @@ class Network(nn.Module): Parameters used to create this neural network. """ - def __new__(cls, params: Parameters=None): + def __new__(cls, params: Parameters = None): """ Create a neural network instance. 
@@ -454,7 +454,7 @@ def __init__(self, params): self.params.layer_activations[0] ]() - if params.use_gpu: + if params.use_gpu > 0: self.to("cuda") def forward(self, x): diff --git a/mala/network/runner.py b/mala/network/runner.py index 33c1f8558..f7e0be697 100644 --- a/mala/network/runner.py +++ b/mala/network/runner.py @@ -91,7 +91,9 @@ def save_run( if hasattr(self.network, "save_network"): self.network.save_network(os.path.join(save_path, model_file)) else: - self.network.module.save_network(os.path.join(save_path, model_file)) + self.network.module.save_network( + os.path.join(save_path, model_file) + ) self.data.input_data_scaler.save(os.path.join(save_path, iscaler_file)) self.data.output_data_scaler.save( os.path.join(save_path, oscaler_file) @@ -428,7 +430,7 @@ def __prepare_to_run(self): """ # See if we want to use horovod. if self.parameters_full.use_horovod: - if self.parameters_full.use_gpu: + if self.parameters_full.use_gpu > 0: # We cannot use "printout" here because this is supposed # to happen on every rank. if self.parameters_full.verbosity >= 2: diff --git a/mala/network/trainer.py b/mala/network/trainer.py index c3123768b..01632a380 100644 --- a/mala/network/trainer.py +++ b/mala/network/trainer.py @@ -308,7 +308,7 @@ def train_network(self): if isinstance(self.data.training_data_sets[0], FastTensorDataset): self.data.training_data_sets[0].shuffle() - if self.parameters._configuration["gpu"]: + if self.parameters._configuration["gpu"] > 0: torch.cuda.synchronize( self.parameters._configuration["device"] ) @@ -445,7 +445,7 @@ def train_network(self): # to disk self.tensor_board.close() - if self.parameters._configuration["gpu"]: + if self.parameters._configuration["gpu"] > 0: torch.cuda.synchronize( self.parameters._configuration["device"] ) @@ -454,7 +454,7 @@ def train_network(self): # in the lazy loading case). if self.parameters.use_shuffling_for_samplers: self.data.mix_datasets() - if self.parameters._configuration["gpu"]: + if self.parameters._configuration["gpu"] > 0: torch.cuda.synchronize( self.parameters._configuration["device"] ) @@ -559,7 +559,7 @@ def __prepare_to_train(self, optimizer_dict): "num_workers": self.parameters.num_workers, "pin_memory": False, } - if self.parameters_full.use_gpu: + if self.parameters_full.use_gpu > 0: kwargs["pin_memory"] = True # Read last epoch @@ -776,17 +776,20 @@ def __prepare_to_train(self, optimizer_dict): ) ) - - if self.parameters_full.use_gpu and self.parameters_full.running.num_gpus > 1: + if self.parameters_full.use_gpu > 1: if self.parameters_full.network.nn_type != "feed-forward": - raise Exception("Only feed-forward networks are supported " - "with multiple GPUs.") - self.network = torch.nn.DataParallel(self.network, - device_ids=list(range(self.parameters_full.running.num_gpus))) + raise Exception( + "Only feed-forward networks are supported " + "with multiple GPUs." 
+ ) + self.network = torch.nn.DataParallel( + self.network, + device_ids=list(range(self.parameters_full.use_gpu)), + ) def __process_mini_batch(self, network, input_data, target_data): """Process a mini batch.""" - if self.parameters._configuration["gpu"]: + if self.parameters._configuration["gpu"] > 0: if self.parameters.use_graphs and self.train_graph is None: printout("Capturing CUDA graph for training.", min_verbosity=2) s = torch.cuda.Stream(self.parameters._configuration["device"]) @@ -808,9 +811,13 @@ def __process_mini_batch(self, network, input_data, target_data): prediction, target_data ) if hasattr(network, "calculate_loss"): - loss = network.calculate_loss(prediction, target_data) + loss = network.calculate_loss( + prediction, target_data + ) else: - loss = network.module.calculate_loss(prediction, target_data) + loss = network.module.calculate_loss( + prediction, target_data + ) if self.gradscaler: self.gradscaler.scale(loss).backward() @@ -840,9 +847,13 @@ def __process_mini_batch(self, network, input_data, target_data): ) if hasattr(network, "calculate_loss"): - self.static_loss = network.calculate_loss(self.static_prediction, self.static_target_data) + self.static_loss = network.calculate_loss( + self.static_prediction, self.static_target_data + ) else: - self.static_loss = network.module.calculate_loss(self.static_prediction, self.static_target_data) + self.static_loss = network.module.calculate_loss( + self.static_prediction, self.static_target_data + ) if self.gradscaler: self.gradscaler.scale(self.static_loss).backward() @@ -871,7 +882,9 @@ def __process_mini_batch(self, network, input_data, target_data): if hasattr(network, "calculate_loss"): loss = network.calculate_loss(prediction, target_data) else: - loss = network.module.calculate_loss(prediction, target_data) + loss = network.module.calculate_loss( + prediction, target_data + ) # loss torch.cuda.nvtx.range_pop() @@ -930,7 +943,7 @@ def __validate_network(self, network, data_set_type, validation_type): 1, device=self.parameters._configuration["device"] ) with torch.no_grad(): - if self.parameters._configuration["gpu"]: + if self.parameters._configuration["gpu"] > 0: report_freq = self.parameters.training_report_frequency torch.cuda.synchronize( self.parameters._configuration["device"] @@ -973,10 +986,16 @@ def __validate_network(self, network, data_set_type, validation_type): enabled=self.parameters.use_mixed_precision ): prediction = network(x) - if hasattr(network, "calculate_loss"): - loss = network.calculate_loss(prediction, y) + if hasattr( + network, "calculate_loss" + ): + loss = network.calculate_loss( + prediction, y + ) else: - loss = network.module.calculate_loss(prediction, y) + loss = network.module.calculate_loss( + prediction, y + ) torch.cuda.current_stream( self.parameters._configuration["device"] ).wait_stream(s) @@ -992,13 +1011,28 @@ def __validate_network(self, network, data_set_type, validation_type): # Capture graph self.validation_graph = torch.cuda.CUDAGraph() with torch.cuda.graph(self.validation_graph): - with torch.cuda.amp.autocast(enabled=self.parameters.use_mixed_precision): - self.static_prediction_validation = network(self.static_input_validation) - self.static_loss_validation = network.calculate_loss(self.static_prediction_validation, self.static_target_validation) + with torch.cuda.amp.autocast( + enabled=self.parameters.use_mixed_precision + ): + self.static_prediction_validation = ( + network( + self.static_input_validation + ) + ) + self.static_loss_validation = 
network.calculate_loss( + self.static_prediction_validation, + self.static_target_validation, + ) if hasattr(network, "calculate_loss"): - self.static_loss_validation = network.calculate_loss(self.static_prediction_validation, self.static_target_validation) + self.static_loss_validation = network.calculate_loss( + self.static_prediction_validation, + self.static_target_validation, + ) else: - self.static_loss_validation = network.module.calculate_loss(self.static_prediction_validation, self.static_target_validation) + self.static_loss_validation = network.module.calculate_loss( + self.static_prediction_validation, + self.static_target_validation, + ) with torch.cuda.amp.autocast( enabled=self.parameters.use_mixed_precision ): @@ -1025,9 +1059,13 @@ def __validate_network(self, network, data_set_type, validation_type): ): prediction = network(x) if hasattr(network, "calculate_loss"): - loss = network.calculate_loss(prediction, y) + loss = network.calculate_loss( + prediction, y + ) else: - loss = network.module.calculate_loss(prediction, y) + loss = network.module.calculate_loss( + prediction, y + ) validation_loss_sum += loss if ( batchid != 0 @@ -1063,7 +1101,9 @@ def __validate_network(self, network, data_set_type, validation_type): if hasattr(network, "calculate_loss"): loss = network.calculate_loss(prediction, y) else: - loss = network.module.calculate_loss(prediction, y) + loss = network.module.calculate_loss( + prediction, y + ) validation_loss_sum += loss.item() batchid += 1 From e4f2eed36e406cc709fa1886ef66b501055f8a4b Mon Sep 17 00:00:00 2001 From: nerkulec Date: Thu, 25 Apr 2024 11:11:07 +0200 Subject: [PATCH 3/4] Change hasattr check to module --- mala/network/runner.py | 6 +++--- mala/network/trainer.py | 48 ++++++++++++++++++++--------------------- 2 files changed, 27 insertions(+), 27 deletions(-) diff --git a/mala/network/runner.py b/mala/network/runner.py index f7e0be697..83d97fc60 100644 --- a/mala/network/runner.py +++ b/mala/network/runner.py @@ -88,12 +88,12 @@ def save_run( optimizer_file = run_name + ".optimizer.pth" self.parameters_full.save(os.path.join(save_path, params_file)) - if hasattr(self.network, "save_network"): - self.network.save_network(os.path.join(save_path, model_file)) - else: + if hasattr(self.network, "module"): self.network.module.save_network( os.path.join(save_path, model_file) ) + else: + self.network.save_network(os.path.join(save_path, model_file)) self.data.input_data_scaler.save(os.path.join(save_path, iscaler_file)) self.data.output_data_scaler.save( os.path.join(save_path, oscaler_file) diff --git a/mala/network/trainer.py b/mala/network/trainer.py index 01632a380..3221041f6 100644 --- a/mala/network/trainer.py +++ b/mala/network/trainer.py @@ -810,12 +810,12 @@ def __process_mini_batch(self, network, input_data, target_data): loss = network.calculate_loss( prediction, target_data ) - if hasattr(network, "calculate_loss"): - loss = network.calculate_loss( + if hasattr(network, "module"): + loss = network.module.calculate_loss( prediction, target_data ) else: - loss = network.module.calculate_loss( + loss = network.calculate_loss( prediction, target_data ) @@ -846,12 +846,12 @@ def __process_mini_batch(self, network, input_data, target_data): self.static_prediction, self.static_target_data ) - if hasattr(network, "calculate_loss"): - self.static_loss = network.calculate_loss( + if hasattr(network, "module"): + self.static_loss = network.module.calculate_loss( self.static_prediction, self.static_target_data ) else: - self.static_loss = 
network.module.calculate_loss( + self.static_loss = network.calculate_loss( self.static_prediction, self.static_target_data ) @@ -879,12 +879,12 @@ def __process_mini_batch(self, network, input_data, target_data): torch.cuda.nvtx.range_pop() torch.cuda.nvtx.range_push("loss") - if hasattr(network, "calculate_loss"): - loss = network.calculate_loss(prediction, target_data) - else: + if hasattr(network, "module"): loss = network.module.calculate_loss( prediction, target_data ) + else: + loss = network.calculate_loss(prediction, target_data) # loss torch.cuda.nvtx.range_pop() @@ -907,10 +907,10 @@ def __process_mini_batch(self, network, input_data, target_data): return loss else: prediction = network(input_data) - if hasattr(network, "calculate_loss"): - loss = network.calculate_loss(prediction, target_data) - else: + if hasattr(network, "module"): loss = network.module.calculate_loss(prediction, target_data) + else: + loss = network.calculate_loss(prediction, target_data) loss.backward() self.optimizer.step() self.optimizer.zero_grad() @@ -987,13 +987,13 @@ def __validate_network(self, network, data_set_type, validation_type): ): prediction = network(x) if hasattr( - network, "calculate_loss" + network, "module" ): - loss = network.calculate_loss( + loss = network.module.calculate_loss( prediction, y ) else: - loss = network.module.calculate_loss( + loss = network.calculate_loss( prediction, y ) torch.cuda.current_stream( @@ -1023,13 +1023,13 @@ def __validate_network(self, network, data_set_type, validation_type): self.static_prediction_validation, self.static_target_validation, ) - if hasattr(network, "calculate_loss"): - self.static_loss_validation = network.calculate_loss( + if hasattr(network, "module"): + self.static_loss_validation = network.module.calculate_loss( self.static_prediction_validation, self.static_target_validation, ) else: - self.static_loss_validation = network.module.calculate_loss( + self.static_loss_validation = network.calculate_loss( self.static_prediction_validation, self.static_target_validation, ) @@ -1058,12 +1058,12 @@ def __validate_network(self, network, data_set_type, validation_type): enabled=self.parameters.use_mixed_precision ): prediction = network(x) - if hasattr(network, "calculate_loss"): - loss = network.calculate_loss( + if hasattr(network, "module"): + loss = network.module.calculate_loss( prediction, y ) else: - loss = network.module.calculate_loss( + loss = network.calculate_loss( prediction, y ) validation_loss_sum += loss @@ -1098,12 +1098,12 @@ def __validate_network(self, network, data_set_type, validation_type): y = y.to(self.parameters._configuration["device"]) prediction = network(x) - if hasattr(network, "calculate_loss"): - loss = network.calculate_loss(prediction, y) - else: + if hasattr(network, "module"): loss = network.module.calculate_loss( prediction, y ) + else: + loss = network.calculate_loss(prediction, y) validation_loss_sum += loss.item() batchid += 1 From 0d43cba047cc73585b7b5e7ba7032532ee4cb587 Mon Sep 17 00:00:00 2001 From: Lenz Fiedler Date: Thu, 25 Apr 2024 14:56:42 +0200 Subject: [PATCH 4/4] Added documentation on DP usage --- docs/source/advanced_usage/trainingmodel.rst | 4 +++- mala/common/parameters.py | 13 ++++++++++++- 2 files changed, 15 insertions(+), 2 deletions(-) diff --git a/docs/source/advanced_usage/trainingmodel.rst b/docs/source/advanced_usage/trainingmodel.rst index ddb429368..8a353289f 100644 --- a/docs/source/advanced_usage/trainingmodel.rst +++ b/docs/source/advanced_usage/trainingmodel.rst @@ -22,7 
+22,9 @@ GPU usage via

     parameters.use_gpu = True

 Afterwards, the entire training will be performed on the GPU - given that
-a GPU is available.
+a GPU is available. You can also set ``parameters.use_gpu`` to a specific
+number, e.g., ``parameters.use_gpu = 4``, to train on multiple GPUs within
+one compute node (here, four), provided that many GPUs are available.
 In cooperation with `Nvidia `_, advanced GPU
 performance optimizations have been implemented into MALA.
diff --git a/mala/common/parameters.py b/mala/common/parameters.py
index 1444b8750..070a10100 100644
--- a/mala/common/parameters.py
+++ b/mala/common/parameters.py
@@ -1275,7 +1275,18 @@ def verbosity(self, value):
 
     @property
     def use_gpu(self):
-        """Control whether or not a GPU is used (provided there is one)."""
+        """
+        Control whether a GPU is used (provided there is one).
+
+        Can either be False/True or an integer. If set to False or 0,
+        no GPU will be used. For numbers higher than 0, the behavior differs
+        between training and inference. For training, the number of GPUs
+        set here will be used, i.e., if use_gpu=4, MALA will attempt to use
+        4 GPUs for training. For inference, only one GPU will be used per
+        MPI rank, such that the total number of GPUs used is determined by
+        the number of MPI ranks. Every value higher than 0 is therefore
+        treated the same in inference.
+        """
         return self._use_gpu
 
     @use_gpu.setter
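
Taken together, this series turns ``parameters.use_gpu`` into the single
switch for multi-GPU training: ``True`` is normalized to ``1``, and any value
greater than ``1`` makes the trainer wrap the network in
``torch.nn.DataParallel``. A minimal sketch of the resulting user-facing
setup (the value 4 is illustrative, not a recommendation):

    import mala

    parameters = mala.Parameters()
    # Accepts False/True or an integer GPU count; True is normalized to 1,
    # and 0/False disables GPU use entirely.
    parameters.use_gpu = 4
    # With use_gpu > 1, Trainer.__prepare_to_train wraps the network in
    # torch.nn.DataParallel over device_ids 0..3. Only feed-forward
    # networks are supported; other nn_types raise an Exception.

The rest of the training workflow is unchanged, which is why patch 2 could
drop the dedicated ex07 example again.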
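
The recurring ``hasattr(network, "module")`` check (introduced as
``hasattr(network, "calculate_loss")`` in patch 1 and inverted in patch 3)
is needed because ``torch.nn.DataParallel`` only dispatches ``forward()``;
custom methods such as ``calculate_loss`` and ``save_network`` live on the
wrapped model and must be reached via its ``module`` attribute. A
self-contained sketch of the pattern follows; ``TinyNet`` is an illustrative
stand-in, not a MALA class:

    import torch
    import torch.nn as nn

    class TinyNet(nn.Module):
        """Stand-in for mala.Network: forward() plus a custom loss method."""

        def __init__(self):
            super().__init__()
            self.layer = nn.Linear(4, 2)

        def forward(self, x):
            return self.layer(x)

        def calculate_loss(self, prediction, target):
            return nn.functional.mse_loss(prediction, target)

    device = "cuda" if torch.cuda.is_available() else "cpu"
    net = TinyNet().to(device)
    if torch.cuda.device_count() > 1:
        # DataParallel replicates the model and scatters each batch
        # across the given devices during forward().
        net = nn.DataParallel(
            net, device_ids=list(range(torch.cuda.device_count()))
        )

    x = torch.randn(8, 4, device=device)
    y = torch.randn(8, 2, device=device)
    prediction = net(x)  # dispatched by DataParallel if wrapped

    # Custom methods are not forwarded by the wrapper, hence the check:
    if hasattr(net, "module"):
        loss = net.module.calculate_loss(prediction, y)
    else:
        loss = net.calculate_loss(prediction, y)

Checking for ``module`` rather than ``calculate_loss`` tests the same
distinction, but states the intent (is this a wrapper?) directly, which is
why patch 3 flips the branches.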