From a99d5a6761e435846e5943ba8668841d311a4a6d Mon Sep 17 00:00:00 2001 From: Lenz Fiedler Date: Thu, 21 Nov 2024 13:27:00 +0100 Subject: [PATCH 1/3] Trying to make OpenPMD interface more continuous --- mala/datahandling/data_shuffler.py | 77 ++++++----------- test/shuffling_test.py | 128 ++++++++++++++--------------- 2 files changed, 89 insertions(+), 116 deletions(-) diff --git a/mala/datahandling/data_shuffler.py b/mala/datahandling/data_shuffler.py index fe914559..5d836eff 100644 --- a/mala/datahandling/data_shuffler.py +++ b/mala/datahandling/data_shuffler.py @@ -550,62 +550,35 @@ def shuffle_snapshots( number_of_shuffled_snapshots = self.nr_snapshots number_of_new_snapshots = number_of_shuffled_snapshots - if snapshot_type == "openpmd": - import math - import functools + shuffled_gridsizes = snapshot_size_list // number_of_new_snapshots - specified_number_of_new_snapshots = number_of_new_snapshots - number_of_new_snapshots = functools.reduce( - math.gcd, - [ - snapshot.grid_dimension[0] - for snapshot in self.parameters.snapshot_directories_list - ], - number_of_new_snapshots, + if np.any( + np.array(snapshot_size_list) + - ( + (np.array(snapshot_size_list) // number_of_new_snapshots) + * number_of_new_snapshots + ) + > 0 + ): + number_of_data_points = int( + np.sum(shuffled_gridsizes) * number_of_new_snapshots ) - if number_of_new_snapshots != specified_number_of_new_snapshots: - print( - f"[openPMD shuffling] Reduced the number of output snapshots to " - f"{number_of_new_snapshots} because of the dataset dimensions." - ) - del specified_number_of_new_snapshots - elif snapshot_type == "numpy": - # Implement all of the below for OpenPMD later. - # We need to check if we need to reduce the overall grid size - # because the individual snapshots may not contain enough data - # points - shuffled_gridsizes = snapshot_size_list // number_of_new_snapshots - - if np.any( - np.array(snapshot_size_list) - - ( - (np.array(snapshot_size_list) // number_of_new_snapshots) - * number_of_new_snapshots - ) - > 0 - ): - number_of_data_points = int( - np.sum(shuffled_gridsizes) * number_of_new_snapshots - ) - self.data_points_to_remove = [] - for i in range(0, self.nr_snapshots): - self.data_points_to_remove.append( - snapshot_size_list[i] - - shuffled_gridsizes[i] * number_of_new_snapshots - ) - tot_points_missing = sum(self.data_points_to_remove) - - if tot_points_missing > 0: - printout( - "Warning: number of requested snapshots is not a divisor of", - "the original grid sizes.\n", - f"{tot_points_missing} / {number_of_data_points} data points", - "will be left out of the shuffled snapshots.", - ) + self.data_points_to_remove = [] + for i in range(0, self.nr_snapshots): + self.data_points_to_remove.append( + snapshot_size_list[i] + - shuffled_gridsizes[i] * number_of_new_snapshots + ) + tot_points_missing = sum(self.data_points_to_remove) - else: - raise Exception("Invalid snapshot type.") + if tot_points_missing > 0: + printout( + "Warning: number of requested snapshots is not a divisor of", + "the original grid sizes.\n", + f"{tot_points_missing} / {number_of_data_points} data points", + "will be left out of the shuffled snapshots.", + ) shuffle_dimensions = [ int(number_of_data_points / number_of_new_snapshots), diff --git a/test/shuffling_test.py b/test/shuffling_test.py index 1a4cb367..ffe6181b 100644 --- a/test/shuffling_test.py +++ b/test/shuffling_test.py @@ -50,70 +50,70 @@ def test_seed(self): new = np.load("Be_REshuffled1.out.npy") assert np.isclose(np.sum(np.abs(old - new)), 0.0, atol=accuracy) - # def test_seed_openpmd(self): - # """ - # Test that the shuffling is handled correctly internally. - # - # This function tests the shuffling for OpenPMD and confirms that - # shuffling both from numpy and openpmd into openpmd always gives the - # same results. The first shuffling shuffles from openpmd to openpmd - # format, the second from numpy to openpmd. - # """ - # test_parameters = mala.Parameters() - # test_parameters.data.shuffling_seed = 1234 - # data_shuffler = mala.DataShuffler(test_parameters) - # - # # Add a snapshot we want to use in to the list. - # data_shuffler.add_snapshot( - # "Be_snapshot0.in.h5", - # data_path, - # "Be_snapshot0.out.h5", - # data_path, - # snapshot_type="openpmd", - # ) - # data_shuffler.add_snapshot( - # "Be_snapshot1.in.h5", - # data_path, - # "Be_snapshot1.out.h5", - # data_path, - # snapshot_type="openpmd", - # ) - # - # # After shuffling, these snapshots can be loaded as regular snapshots - # # for lazily loaded training- - # data_shuffler.shuffle_snapshots("./", save_name="Be_shuffled*.h5") - # - # test_parameters = mala.Parameters() - # test_parameters.data.shuffling_seed = 1234 - # data_shuffler = mala.DataShuffler(test_parameters) - # - # # Add a snapshot we want to use in to the list. - # data_shuffler.add_snapshot( - # "Be_snapshot0.in.npy", - # data_path, - # "Be_snapshot0.out.npy", - # data_path, - # snapshot_type="numpy", - # ) - # data_shuffler.add_snapshot( - # "Be_snapshot1.in.npy", - # data_path, - # "Be_snapshot1.out.npy", - # data_path, - # snapshot_type="numpy", - # ) - # - # # After shuffling, these snapshots can be loaded as regular snapshots - # # for lazily loaded training- - # data_shuffler.shuffle_snapshots("./", save_name="Be_REshuffled*.h5") - # - # old = data_shuffler.target_calculator.read_from_openpmd_file( - # "Be_shuffled1.out.h5" - # ) - # new = data_shuffler.target_calculator.read_from_openpmd_file( - # "Be_REshuffled1.out.h5" - # ) - # assert np.isclose(np.sum(np.abs(old - new)), 0.0, atol=accuracy) + def test_seed_openpmd(self): + """ + Test that the shuffling is handled correctly internally. + + This function tests the shuffling for OpenPMD and confirms that + shuffling both from numpy and openpmd into openpmd always gives the + same results. The first shuffling shuffles from openpmd to openpmd + format, the second from numpy to openpmd. + """ + test_parameters = mala.Parameters() + test_parameters.data.shuffling_seed = 1234 + data_shuffler = mala.DataShuffler(test_parameters) + + # Add a snapshot we want to use in to the list. + data_shuffler.add_snapshot( + "Be_snapshot0.in.h5", + data_path, + "Be_snapshot0.out.h5", + data_path, + snapshot_type="openpmd", + ) + data_shuffler.add_snapshot( + "Be_snapshot1.in.h5", + data_path, + "Be_snapshot1.out.h5", + data_path, + snapshot_type="openpmd", + ) + + # After shuffling, these snapshots can be loaded as regular snapshots + # for lazily loaded training- + data_shuffler.shuffle_snapshots("./", save_name="Be_shuffled*.h5") + + test_parameters = mala.Parameters() + test_parameters.data.shuffling_seed = 1234 + data_shuffler = mala.DataShuffler(test_parameters) + + # Add a snapshot we want to use in to the list. + data_shuffler.add_snapshot( + "Be_snapshot0.in.npy", + data_path, + "Be_snapshot0.out.npy", + data_path, + snapshot_type="numpy", + ) + data_shuffler.add_snapshot( + "Be_snapshot1.in.npy", + data_path, + "Be_snapshot1.out.npy", + data_path, + snapshot_type="numpy", + ) + + # After shuffling, these snapshots can be loaded as regular snapshots + # for lazily loaded training- + data_shuffler.shuffle_snapshots("./", save_name="Be_REshuffled*.h5") + + old = data_shuffler.target_calculator.read_from_openpmd_file( + "Be_shuffled1.out.h5" + ) + new = data_shuffler.target_calculator.read_from_openpmd_file( + "Be_REshuffled1.out.h5" + ) + assert np.isclose(np.sum(np.abs(old - new)), 0.0, atol=accuracy) def test_training(self): test_parameters = mala.Parameters() From 4697216346d437c4d3c785a1888b5a96d0979989 Mon Sep 17 00:00:00 2001 From: Lenz Fiedler Date: Thu, 21 Nov 2024 15:18:00 +0100 Subject: [PATCH 2/3] Fixed the inconsistency between numpy and openPMD and added Exception for trying to use OpenPMD with the wrong number of snapshots --- mala/datahandling/data_shuffler.py | 46 +++++++++++++++++++++--------- 1 file changed, 33 insertions(+), 13 deletions(-) diff --git a/mala/datahandling/data_shuffler.py b/mala/datahandling/data_shuffler.py index 5d836eff..55074c1d 100644 --- a/mala/datahandling/data_shuffler.py +++ b/mala/datahandling/data_shuffler.py @@ -134,7 +134,10 @@ def __shuffle_numpy( # if the number of new snapshots is not a divisor of the grid size # then we have to trim the original snapshots to size # the indicies to be removed are selected at random - if self.data_points_to_remove is not None: + if ( + self.data_points_to_remove is not None + and np.sum(self.data_points_to_remove) > 0 + ): if self.parameters.shuffling_seed is not None: np.random.seed(idx * self.parameters.shuffling_seed) ngrid = ( @@ -548,27 +551,44 @@ def shuffle_snapshots( self.data_points_to_remove = None if number_of_shuffled_snapshots is None: number_of_shuffled_snapshots = self.nr_snapshots - number_of_new_snapshots = number_of_shuffled_snapshots - shuffled_gridsizes = snapshot_size_list // number_of_new_snapshots + # Currently, the openPMD interface is not feature-complete. + if np.any( + np.array( + [ + snapshot.grid_dimension[0] % number_of_shuffled_snapshots + for snapshot in self.parameters.snapshot_directories_list + ] + ) + != 0 + ): + raise ValueError( + "Shuffling from OpenPMD files currently only " + "supported if first dimension of all snapshots " + "can evenly be divided by number of snapshots. " + "Please select a different number of shuffled " + "snapshots or use the numpy interface. " + ) + + shuffled_gridsizes = snapshot_size_list // number_of_shuffled_snapshots if np.any( np.array(snapshot_size_list) - ( - (np.array(snapshot_size_list) // number_of_new_snapshots) - * number_of_new_snapshots + (np.array(snapshot_size_list) // number_of_shuffled_snapshots) + * number_of_shuffled_snapshots ) > 0 ): number_of_data_points = int( - np.sum(shuffled_gridsizes) * number_of_new_snapshots + np.sum(shuffled_gridsizes) * number_of_shuffled_snapshots ) self.data_points_to_remove = [] for i in range(0, self.nr_snapshots): self.data_points_to_remove.append( snapshot_size_list[i] - - shuffled_gridsizes[i] * number_of_new_snapshots + - shuffled_gridsizes[i] * number_of_shuffled_snapshots ) tot_points_missing = sum(self.data_points_to_remove) @@ -581,14 +601,14 @@ def shuffle_snapshots( ) shuffle_dimensions = [ - int(number_of_data_points / number_of_new_snapshots), + int(number_of_data_points / number_of_shuffled_snapshots), 1, 1, ] printout( "Data shuffler will generate", - number_of_new_snapshots, + number_of_shuffled_snapshots, "new snapshots.", ) printout("Shuffled snapshot dimension will be ", shuffle_dimensions) @@ -596,7 +616,7 @@ def shuffle_snapshots( # Prepare permutations. permutations = [] seeds = [] - for i in range(0, number_of_new_snapshots): + for i in range(0, number_of_shuffled_snapshots): # This makes the shuffling deterministic, if specified by the user. if self.parameters.shuffling_seed is not None: np.random.seed(i * self.parameters.shuffling_seed) @@ -606,7 +626,7 @@ def shuffle_snapshots( if snapshot_type == "numpy": self.__shuffle_numpy( - number_of_new_snapshots, + number_of_shuffled_snapshots, shuffle_dimensions, descriptor_save_path, save_name, @@ -625,7 +645,7 @@ def shuffle_snapshots( ) self.__shuffle_openpmd( descriptor, - number_of_new_snapshots, + number_of_shuffled_snapshots, shuffle_dimensions, save_name, permutations, @@ -641,7 +661,7 @@ def shuffle_snapshots( ) self.__shuffle_openpmd( target, - number_of_new_snapshots, + number_of_shuffled_snapshots, shuffle_dimensions, save_name, permutations, From 3d4ee9e3a88a173320113ca444e12b3796941a49 Mon Sep 17 00:00:00 2001 From: Lenz Fiedler Date: Thu, 21 Nov 2024 17:18:34 +0100 Subject: [PATCH 3/3] Snapshot check should only be performed if OpenPMD is selected --- mala/datahandling/data_shuffler.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mala/datahandling/data_shuffler.py b/mala/datahandling/data_shuffler.py index 55074c1d..c3f71644 100644 --- a/mala/datahandling/data_shuffler.py +++ b/mala/datahandling/data_shuffler.py @@ -553,7 +553,7 @@ def shuffle_snapshots( number_of_shuffled_snapshots = self.nr_snapshots # Currently, the openPMD interface is not feature-complete. - if np.any( + if snapshot_type == "openpmd" and np.any( np.array( [ snapshot.grid_dimension[0] % number_of_shuffled_snapshots