From a99d5a6761e435846e5943ba8668841d311a4a6d Mon Sep 17 00:00:00 2001
From: Lenz Fiedler <l.fiedler@hzdr.de>
Date: Thu, 21 Nov 2024 13:27:00 +0100
Subject: [PATCH 1/3] Trying to make OpenPMD interface more continuous

---
 mala/datahandling/data_shuffler.py |  77 ++++++-----------
 test/shuffling_test.py             | 128 ++++++++++++++---------------
 2 files changed, 89 insertions(+), 116 deletions(-)

diff --git a/mala/datahandling/data_shuffler.py b/mala/datahandling/data_shuffler.py
index fe914559..5d836eff 100644
--- a/mala/datahandling/data_shuffler.py
+++ b/mala/datahandling/data_shuffler.py
@@ -550,62 +550,35 @@ def shuffle_snapshots(
             number_of_shuffled_snapshots = self.nr_snapshots
         number_of_new_snapshots = number_of_shuffled_snapshots
 
-        if snapshot_type == "openpmd":
-            import math
-            import functools
+        shuffled_gridsizes = snapshot_size_list // number_of_new_snapshots
 
-            specified_number_of_new_snapshots = number_of_new_snapshots
-            number_of_new_snapshots = functools.reduce(
-                math.gcd,
-                [
-                    snapshot.grid_dimension[0]
-                    for snapshot in self.parameters.snapshot_directories_list
-                ],
-                number_of_new_snapshots,
+        if np.any(
+            np.array(snapshot_size_list)
+            - (
+                (np.array(snapshot_size_list) // number_of_new_snapshots)
+                * number_of_new_snapshots
+            )
+            > 0
+        ):
+            number_of_data_points = int(
+                np.sum(shuffled_gridsizes) * number_of_new_snapshots
             )
-            if number_of_new_snapshots != specified_number_of_new_snapshots:
-                print(
-                    f"[openPMD shuffling] Reduced the number of output snapshots to "
-                    f"{number_of_new_snapshots} because of the dataset dimensions."
-                )
-            del specified_number_of_new_snapshots
-        elif snapshot_type == "numpy":
-            # Implement all of the below for OpenPMD later.
-            # We need to check if we need to reduce the overall grid size
-            # because the individual snapshots may not contain enough data
-            # points
-            shuffled_gridsizes = snapshot_size_list // number_of_new_snapshots
-
-            if np.any(
-                np.array(snapshot_size_list)
-                - (
-                    (np.array(snapshot_size_list) // number_of_new_snapshots)
-                    * number_of_new_snapshots
-                )
-                > 0
-            ):
-                number_of_data_points = int(
-                    np.sum(shuffled_gridsizes) * number_of_new_snapshots
-                )
 
-            self.data_points_to_remove = []
-            for i in range(0, self.nr_snapshots):
-                self.data_points_to_remove.append(
-                    snapshot_size_list[i]
-                    - shuffled_gridsizes[i] * number_of_new_snapshots
-                )
-            tot_points_missing = sum(self.data_points_to_remove)
-
-            if tot_points_missing > 0:
-                printout(
-                    "Warning: number of requested snapshots is not a divisor of",
-                    "the original grid sizes.\n",
-                    f"{tot_points_missing} / {number_of_data_points} data points",
-                    "will be left out of the shuffled snapshots.",
-                )
+        self.data_points_to_remove = []
+        for i in range(0, self.nr_snapshots):
+            self.data_points_to_remove.append(
+                snapshot_size_list[i]
+                - shuffled_gridsizes[i] * number_of_new_snapshots
+            )
+        tot_points_missing = sum(self.data_points_to_remove)
 
-        else:
-            raise Exception("Invalid snapshot type.")
+        if tot_points_missing > 0:
+            printout(
+                "Warning: number of requested snapshots is not a divisor of",
+                "the original grid sizes.\n",
+                f"{tot_points_missing} / {number_of_data_points} data points",
+                "will be left out of the shuffled snapshots.",
+            )
 
         shuffle_dimensions = [
             int(number_of_data_points / number_of_new_snapshots),
diff --git a/test/shuffling_test.py b/test/shuffling_test.py
index 1a4cb367..ffe6181b 100644
--- a/test/shuffling_test.py
+++ b/test/shuffling_test.py
@@ -50,70 +50,70 @@ def test_seed(self):
         new = np.load("Be_REshuffled1.out.npy")
         assert np.isclose(np.sum(np.abs(old - new)), 0.0, atol=accuracy)
 
-    # def test_seed_openpmd(self):
-    #     """
-    #     Test that the shuffling is handled correctly internally.
-    #
-    #     This function tests the shuffling for OpenPMD and confirms that
-    #     shuffling both from numpy and openpmd into openpmd always gives the
-    #     same results. The first shuffling shuffles from openpmd to openpmd
-    #     format, the second from numpy to openpmd.
-    #     """
-    #     test_parameters = mala.Parameters()
-    #     test_parameters.data.shuffling_seed = 1234
-    #     data_shuffler = mala.DataShuffler(test_parameters)
-    #
-    #     # Add a snapshot we want to use in to the list.
-    #     data_shuffler.add_snapshot(
-    #         "Be_snapshot0.in.h5",
-    #         data_path,
-    #         "Be_snapshot0.out.h5",
-    #         data_path,
-    #         snapshot_type="openpmd",
-    #     )
-    #     data_shuffler.add_snapshot(
-    #         "Be_snapshot1.in.h5",
-    #         data_path,
-    #         "Be_snapshot1.out.h5",
-    #         data_path,
-    #         snapshot_type="openpmd",
-    #     )
-    #
-    #     # After shuffling, these snapshots can be loaded as regular snapshots
-    #     # for lazily loaded training-
-    #     data_shuffler.shuffle_snapshots("./", save_name="Be_shuffled*.h5")
-    #
-    #     test_parameters = mala.Parameters()
-    #     test_parameters.data.shuffling_seed = 1234
-    #     data_shuffler = mala.DataShuffler(test_parameters)
-    #
-    #     # Add a snapshot we want to use in to the list.
-    #     data_shuffler.add_snapshot(
-    #         "Be_snapshot0.in.npy",
-    #         data_path,
-    #         "Be_snapshot0.out.npy",
-    #         data_path,
-    #         snapshot_type="numpy",
-    #     )
-    #     data_shuffler.add_snapshot(
-    #         "Be_snapshot1.in.npy",
-    #         data_path,
-    #         "Be_snapshot1.out.npy",
-    #         data_path,
-    #         snapshot_type="numpy",
-    #     )
-    #
-    #     # After shuffling, these snapshots can be loaded as regular snapshots
-    #     # for lazily loaded training-
-    #     data_shuffler.shuffle_snapshots("./", save_name="Be_REshuffled*.h5")
-    #
-    #     old = data_shuffler.target_calculator.read_from_openpmd_file(
-    #         "Be_shuffled1.out.h5"
-    #     )
-    #     new = data_shuffler.target_calculator.read_from_openpmd_file(
-    #         "Be_REshuffled1.out.h5"
-    #     )
-    #     assert np.isclose(np.sum(np.abs(old - new)), 0.0, atol=accuracy)
+    def test_seed_openpmd(self):
+        """
+        Test that the shuffling is handled correctly internally.
+
+        This function tests the shuffling for OpenPMD and confirms that
+        shuffling both from numpy and openpmd into openpmd always gives the
+        same results. The first shuffling shuffles from openpmd to openpmd
+        format, the second from numpy to openpmd.
+        """
+        test_parameters = mala.Parameters()
+        test_parameters.data.shuffling_seed = 1234
+        data_shuffler = mala.DataShuffler(test_parameters)
+
+        # Add the snapshots we want to shuffle to the list.
+        data_shuffler.add_snapshot(
+            "Be_snapshot0.in.h5",
+            data_path,
+            "Be_snapshot0.out.h5",
+            data_path,
+            snapshot_type="openpmd",
+        )
+        data_shuffler.add_snapshot(
+            "Be_snapshot1.in.h5",
+            data_path,
+            "Be_snapshot1.out.h5",
+            data_path,
+            snapshot_type="openpmd",
+        )
+
+        # After shuffling, these snapshots can be loaded as regular snapshots
+        # for lazily loaded training.
+        data_shuffler.shuffle_snapshots("./", save_name="Be_shuffled*.h5")
+
+        test_parameters = mala.Parameters()
+        test_parameters.data.shuffling_seed = 1234
+        data_shuffler = mala.DataShuffler(test_parameters)
+
+        # Add the snapshots we want to shuffle to the list.
+        data_shuffler.add_snapshot(
+            "Be_snapshot0.in.npy",
+            data_path,
+            "Be_snapshot0.out.npy",
+            data_path,
+            snapshot_type="numpy",
+        )
+        data_shuffler.add_snapshot(
+            "Be_snapshot1.in.npy",
+            data_path,
+            "Be_snapshot1.out.npy",
+            data_path,
+            snapshot_type="numpy",
+        )
+
+        # After shuffling, these snapshots can be loaded as regular snapshots
+        # for lazily loaded training.
+        data_shuffler.shuffle_snapshots("./", save_name="Be_REshuffled*.h5")
+
+        old = data_shuffler.target_calculator.read_from_openpmd_file(
+            "Be_shuffled1.out.h5"
+        )
+        new = data_shuffler.target_calculator.read_from_openpmd_file(
+            "Be_REshuffled1.out.h5"
+        )
+        assert np.isclose(np.sum(np.abs(old - new)), 0.0, atol=accuracy)
 
     def test_training(self):
         test_parameters = mala.Parameters()

From 4697216346d437c4d3c785a1888b5a96d0979989 Mon Sep 17 00:00:00 2001
From: Lenz Fiedler <l.fiedler@hzdr.de>
Date: Thu, 21 Nov 2024 15:18:00 +0100
Subject: [PATCH 2/3] Fixed the inconsistency between numpy and openPMD and
 added Exception for trying to use OpenPMD with the wrong number of snapshots

---
 mala/datahandling/data_shuffler.py | 46 +++++++++++++++++++++---------
 1 file changed, 33 insertions(+), 13 deletions(-)

diff --git a/mala/datahandling/data_shuffler.py b/mala/datahandling/data_shuffler.py
index 5d836eff..55074c1d 100644
--- a/mala/datahandling/data_shuffler.py
+++ b/mala/datahandling/data_shuffler.py
@@ -134,7 +134,10 @@ def __shuffle_numpy(
             # if the number of new snapshots is not a divisor of the grid size
             # then we have to trim the original snapshots to size
             # the indicies to be removed are selected at random
-            if self.data_points_to_remove is not None:
+            if (
+                self.data_points_to_remove is not None
+                and np.sum(self.data_points_to_remove) > 0
+            ):
                 if self.parameters.shuffling_seed is not None:
                     np.random.seed(idx * self.parameters.shuffling_seed)
                 ngrid = (
@@ -548,27 +551,44 @@ def shuffle_snapshots(
         self.data_points_to_remove = None
         if number_of_shuffled_snapshots is None:
             number_of_shuffled_snapshots = self.nr_snapshots
-        number_of_new_snapshots = number_of_shuffled_snapshots
 
-        shuffled_gridsizes = snapshot_size_list // number_of_new_snapshots
+        # Currently, the openPMD interface is not feature-complete.
+        if np.any(
+            np.array(
+                [
+                    snapshot.grid_dimension[0] % number_of_shuffled_snapshots
+                    for snapshot in self.parameters.snapshot_directories_list
+                ]
+            )
+            != 0
+        ):
+            raise ValueError(
+                "Shuffling from OpenPMD files currently only "
+                "supported if first dimension of all snapshots "
+                "can evenly be divided by number of snapshots. "
+                "Please select a different number of shuffled "
+                "snapshots or use the numpy interface. "
+            )
+
+        shuffled_gridsizes = snapshot_size_list // number_of_shuffled_snapshots
 
         if np.any(
             np.array(snapshot_size_list)
             - (
-                (np.array(snapshot_size_list) // number_of_new_snapshots)
-                * number_of_new_snapshots
+                (np.array(snapshot_size_list) // number_of_shuffled_snapshots)
+                * number_of_shuffled_snapshots
             )
             > 0
         ):
             number_of_data_points = int(
-                np.sum(shuffled_gridsizes) * number_of_new_snapshots
+                np.sum(shuffled_gridsizes) * number_of_shuffled_snapshots
             )
 
         self.data_points_to_remove = []
         for i in range(0, self.nr_snapshots):
             self.data_points_to_remove.append(
                 snapshot_size_list[i]
-                - shuffled_gridsizes[i] * number_of_new_snapshots
+                - shuffled_gridsizes[i] * number_of_shuffled_snapshots
             )
         tot_points_missing = sum(self.data_points_to_remove)
 
@@ -581,14 +601,14 @@ def shuffle_snapshots(
             )
 
         shuffle_dimensions = [
-            int(number_of_data_points / number_of_new_snapshots),
+            int(number_of_data_points / number_of_shuffled_snapshots),
             1,
             1,
         ]
 
         printout(
             "Data shuffler will generate",
-            number_of_new_snapshots,
+            number_of_shuffled_snapshots,
             "new snapshots.",
         )
         printout("Shuffled snapshot dimension will be ", shuffle_dimensions)
@@ -596,7 +616,7 @@ def shuffle_snapshots(
         # Prepare permutations.
         permutations = []
         seeds = []
-        for i in range(0, number_of_new_snapshots):
+        for i in range(0, number_of_shuffled_snapshots):
             # This makes the shuffling deterministic, if specified by the user.
             if self.parameters.shuffling_seed is not None:
                 np.random.seed(i * self.parameters.shuffling_seed)
@@ -606,7 +626,7 @@ def shuffle_snapshots(
 
         if snapshot_type == "numpy":
             self.__shuffle_numpy(
-                number_of_new_snapshots,
+                number_of_shuffled_snapshots,
                 shuffle_dimensions,
                 descriptor_save_path,
                 save_name,
@@ -625,7 +645,7 @@ def shuffle_snapshots(
             )
             self.__shuffle_openpmd(
                 descriptor,
-                number_of_new_snapshots,
+                number_of_shuffled_snapshots,
                 shuffle_dimensions,
                 save_name,
                 permutations,
@@ -641,7 +661,7 @@ def shuffle_snapshots(
             )
             self.__shuffle_openpmd(
                 target,
-                number_of_new_snapshots,
+                number_of_shuffled_snapshots,
                 shuffle_dimensions,
                 save_name,
                 permutations,

From 3d4ee9e3a88a173320113ca444e12b3796941a49 Mon Sep 17 00:00:00 2001
From: Lenz Fiedler <l.fiedler@hzdr.de>
Date: Thu, 21 Nov 2024 17:18:34 +0100
Subject: [PATCH 3/3] Snapshot check should only be performed if OpenPMD is
 selected

---
 mala/datahandling/data_shuffler.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/mala/datahandling/data_shuffler.py b/mala/datahandling/data_shuffler.py
index 55074c1d..c3f71644 100644
--- a/mala/datahandling/data_shuffler.py
+++ b/mala/datahandling/data_shuffler.py
@@ -553,7 +553,7 @@ def shuffle_snapshots(
             number_of_shuffled_snapshots = self.nr_snapshots
 
         # Currently, the openPMD interface is not feature-complete.
-        if np.any(
+        if snapshot_type == "openpmd" and np.any(
             np.array(
                 [
                     snapshot.grid_dimension[0] % number_of_shuffled_snapshots