From cd1a6965198d21d91f5470bc40a29c04f9d54df9 Mon Sep 17 00:00:00 2001
From: Lenz Fiedler <l.fiedler@hzdr.de>
Date: Fri, 22 Dec 2023 09:50:17 +0100
Subject: [PATCH 1/2] Targeted correct device for CUDA synchronize

---
 mala/network/trainer.py | 18 +++++++++---------
 1 file changed, 9 insertions(+), 9 deletions(-)

diff --git a/mala/network/trainer.py b/mala/network/trainer.py
index 98dc291b8..bc3cfc544 100644
--- a/mala/network/trainer.py
+++ b/mala/network/trainer.py
@@ -279,7 +279,7 @@ def train_network(self):
                 self.data.training_data_sets[0].shuffle()
 
             if self.parameters._configuration["gpu"]:
-                torch.cuda.synchronize()
+                torch.cuda.synchronize(self.parameters._configuration["device"])
                 tsample = time.time()
                 t0 = time.time()
                 batchid = 0
@@ -309,7 +309,7 @@ def train_network(self):
                         training_loss_sum += loss
 
                         if batchid != 0 and (batchid + 1) % self.parameters.training_report_frequency == 0:
-                            torch.cuda.synchronize()
+                            torch.cuda.synchronize(self.parameters._configuration["device"])
                             sample_time = time.time() - tsample
                             avg_sample_time = sample_time / self.parameters.training_report_frequency
                             avg_sample_tput = self.parameters.training_report_frequency * inputs.shape[0] / sample_time
@@ -319,14 +319,14 @@ def train_network(self):
                                      min_verbosity=2)
                             tsample = time.time()
                         batchid += 1
-                torch.cuda.synchronize()
+                torch.cuda.synchronize(self.parameters._configuration["device"])
                 t1 = time.time()
                 printout(f"training time: {t1 - t0}", min_verbosity=2)
 
                 training_loss = training_loss_sum.item() / batchid
 
                 # Calculate the validation loss. and output it.
-                torch.cuda.synchronize()
+                torch.cuda.synchronize(self.parameters._configuration["device"])
             else:
                 batchid = 0
                 for loader in self.training_data_loaders:
@@ -375,14 +375,14 @@ def train_network(self):
                 self.tensor_board.close()
 
             if self.parameters._configuration["gpu"]:
-                torch.cuda.synchronize()
+                torch.cuda.synchronize(self.parameters._configuration["device"])
 
             # Mix the DataSets up (this function only does something
             # in the lazy loading case).
             if self.parameters.use_shuffling_for_samplers:
                 self.data.mix_datasets()
             if self.parameters._configuration["gpu"]:
-                torch.cuda.synchronize()
+                torch.cuda.synchronize(self.parameters._configuration["device"])
 
             # If a scheduler is used, update it.
             if self.scheduler is not None:
@@ -742,7 +742,7 @@ def __validate_network(self, network, data_set_type, validation_type):
             with torch.no_grad():
                 if self.parameters._configuration["gpu"]:
                     report_freq = self.parameters.training_report_frequency
-                    torch.cuda.synchronize()
+                    torch.cuda.synchronize(self.parameters._configuration["device"])
                     tsample = time.time()
                     batchid = 0
                     for loader in data_loaders:
@@ -786,7 +786,7 @@ def __validate_network(self, network, data_set_type, validation_type):
                                     loss = network.calculate_loss(prediction, y)
                                     validation_loss_sum += loss
                             if batchid != 0 and (batchid + 1) % report_freq == 0:
-                                torch.cuda.synchronize()
+                                torch.cuda.synchronize(self.parameters._configuration["device"])
                                 sample_time = time.time() - tsample
                                 avg_sample_time = sample_time / report_freq
                                 avg_sample_tput = report_freq * x.shape[0] / sample_time
@@ -796,7 +796,7 @@ def __validate_network(self, network, data_set_type, validation_type):
                                          min_verbosity=2)
                                 tsample = time.time()
                             batchid += 1
-                    torch.cuda.synchronize()
+                    torch.cuda.synchronize(self.parameters._configuration["device"])
                 else:
                     batchid = 0
                     for loader in data_loaders:

From 45f074943fcdb45e139599bf735a148c34a4e7d0 Mon Sep 17 00:00:00 2001
From: Lenz Fiedler <l.fiedler@hzdr.de>
Date: Fri, 22 Dec 2023 10:08:49 +0100
Subject: [PATCH 2/2] Also included the device for stream operations, for good
 measure

---
 mala/network/trainer.py | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/mala/network/trainer.py b/mala/network/trainer.py
index bc3cfc544..0fafb67be 100644
--- a/mala/network/trainer.py
+++ b/mala/network/trainer.py
@@ -636,8 +636,8 @@ def __process_mini_batch(self, network, input_data, target_data):
         if self.parameters._configuration["gpu"]:
             if self.parameters.use_graphs and self.train_graph is None:
                 printout("Capturing CUDA graph for training.", min_verbosity=2)
-                s = torch.cuda.Stream()
-                s.wait_stream(torch.cuda.current_stream())
+                s = torch.cuda.Stream(self.parameters._configuration["device"])
+                s.wait_stream(torch.cuda.current_stream(self.parameters._configuration["device"]))
                 # Warmup for graphs
                 with torch.cuda.stream(s):
                     for _ in range(20):
@@ -651,7 +651,7 @@ def __process_mini_batch(self, network, input_data, target_data):
                             self.gradscaler.scale(loss).backward()
                         else:
                             loss.backward()
-                torch.cuda.current_stream().wait_stream(s)
+                torch.cuda.current_stream(self.parameters._configuration["device"]).wait_stream(s)
 
                 # Create static entry point tensors to graph
                 self.static_input_data = torch.empty_like(input_data)
@@ -754,15 +754,15 @@ def __validate_network(self, network, data_set_type, validation_type):
 
                             if self.parameters.use_graphs and self.validation_graph is None:
                                 printout("Capturing CUDA graph for validation.", min_verbosity=2)
-                                s = torch.cuda.Stream()
-                                s.wait_stream(torch.cuda.current_stream())
+                                s = torch.cuda.Stream(self.parameters._configuration["device"])
+                                s.wait_stream(torch.cuda.current_stream(self.parameters._configuration["device"]))
                                 # Warmup for graphs
                                 with torch.cuda.stream(s):
                                     for _ in range(20):
                                         with torch.cuda.amp.autocast(enabled=self.parameters.use_mixed_precision):
                                             prediction = network(x)
                                             loss = network.calculate_loss(prediction, y)
-                                torch.cuda.current_stream().wait_stream(s)
+                                torch.cuda.current_stream(self.parameters._configuration["device"]).wait_stream(s)
 
                                 # Create static entry point tensors to graph
                                 self.static_input_validation = torch.empty_like(x)