From a063b0d9f6e61dfe596cfdee419db22d46e214aa Mon Sep 17 00:00:00 2001 From: Adam Narozniak Date: Thu, 21 Sep 2023 08:57:21 +0200 Subject: [PATCH 1/7] Enable more partitioners specification in more types --- datasets/flwr_datasets/federated_dataset.py | 9 +- .../flwr_datasets/federated_dataset_test.py | 90 +++++++++++++++++++ datasets/flwr_datasets/utils.py | 31 +++++-- 3 files changed, 122 insertions(+), 8 deletions(-) diff --git a/datasets/flwr_datasets/federated_dataset.py b/datasets/flwr_datasets/federated_dataset.py index bfd7ceff70b1..398812750c81 100644 --- a/datasets/flwr_datasets/federated_dataset.py +++ b/datasets/flwr_datasets/federated_dataset.py @@ -15,7 +15,7 @@ """FederatedDataset.""" -from typing import Dict, Optional +from typing import Dict, Optional, Union import datasets from datasets import Dataset, DatasetDict @@ -53,7 +53,12 @@ class FederatedDataset: >>> centralized = mnist_fds.load_full("test") """ - def __init__(self, *, dataset: str, partitioners: Dict[str, int]) -> None: + def __init__( + self, + *, + dataset: str, + partitioners: Union[int, Partitioner, Dict[str, int], Dict[str, Partitioner]], + ) -> None: _check_if_dataset_supported(dataset) self._dataset_name: str = dataset self._partitioners: Dict[str, Partitioner] = _instantiate_partitioners( diff --git a/datasets/flwr_datasets/federated_dataset_test.py b/datasets/flwr_datasets/federated_dataset_test.py index 467f9d856ed4..335bb231de7f 100644 --- a/datasets/flwr_datasets/federated_dataset_test.py +++ b/datasets/flwr_datasets/federated_dataset_test.py @@ -13,15 +13,18 @@ # limitations under the License. # ============================================================================== """Federated Dataset tests.""" +# pylint: disable=W0212, C0103 import unittest +from typing import Dict import pytest from parameterized import parameterized, parameterized_class import datasets from flwr_datasets.federated_dataset import FederatedDataset +from flwr_datasets.partitioner import IidPartitioner, Partitioner @parameterized_class( @@ -91,6 +94,93 @@ def test_multiple_partitioners(self) -> None: ) +class PartitionersSpecificationForFederatedDatasets(unittest.TestCase): + """Test the specifications of partitioners for FederatedDatasets.""" + + dataset_name = "cifar10" + test_split = "test" + + def test_partitioner_as_int_produces_dict_of_partitioners(self) -> None: + """Test specifying partitioner as an integer creates a dictionary.""" + num_train_partitions = 100 + fds = FederatedDataset( + dataset=self.dataset_name, + partitioners=num_train_partitions, + ) + self.assertIsInstance(fds._partitioners, dict) + + def test_partitioner_as_int_produces_single_partitioner_for_train_data( + self, + ) -> None: + """Test if int partitioner creates a single partitioner for the train set.""" + num_train_partitions = 100 + fds = FederatedDataset( + dataset=self.dataset_name, + partitioners=num_train_partitions, + ) + self.assertTrue(len(fds._partitioners) == 1 and "train" in fds._partitioners) + + def test_partitioner_as_int_produces_IidPartitioner(self) -> None: + """Test if int partitioner creates IidPartitioner for the training set.""" + num_train_partitions = 100 + fds = FederatedDataset( + dataset=self.dataset_name, + partitioners=num_train_partitions, + ) + self.assertIsInstance(fds._partitioners["train"], IidPartitioner) + + def test_single_partitioner_gets_assigned_to_train_set(self) -> None: + """Test if IidPartitioner gets assigned to the train data set.""" + num_train_partitions = 100 + iid_partitioner = 
IidPartitioner(num_partitions=num_train_partitions) + fds = FederatedDataset( + dataset=self.dataset_name, + partitioners=iid_partitioner, + ) + self.assertTrue(len(fds._partitioners) == 1 and "train" in fds._partitioners) + + def test_single_partitioner_given_is_the_same_object(self) -> None: + """Test if single partitioner is passed directly.""" + num_train_partitions = 100 + iid_partitioner = IidPartitioner(num_partitions=num_train_partitions) + fds = FederatedDataset( + dataset=self.dataset_name, + partitioners=iid_partitioner, + ) + self.assertIs(fds._partitioners["train"], iid_partitioner) + + def test_dict_of_partitioners_given_is_the_same_in_FederatedDataset(self) -> None: + """Test if a dictionary of partitioners is passed directly.""" + num_train_partitions = 100 + num_test_partitions = 100 + partitioners: Dict[str, Partitioner] = { + "train": IidPartitioner(num_partitions=num_train_partitions), + "test": IidPartitioner(num_partitions=num_test_partitions), + } + fds = FederatedDataset( + dataset=self.dataset_name, + partitioners=partitioners, + ) + self.assertIs(fds._partitioners, partitioners) + + def test_dict_str_int_produces_correct_partitioners(self) -> None: + """Test if dict partitioners have the same keys.""" + num_train_partitions = 100 + num_test_partitions = 100 + fds = FederatedDataset( + dataset=self.dataset_name, + partitioners={ + "train": num_train_partitions, + "test": num_test_partitions, + }, + ) + self.assertTrue( + len(fds._partitioners) == 2 + and "train" in fds._partitioners + and "test" in fds._partitioners + ) + + class IncorrectUsageFederatedDatasets(unittest.TestCase): """Test incorrect usages in FederatedDatasets.""" diff --git a/datasets/flwr_datasets/utils.py b/datasets/flwr_datasets/utils.py index 734594b77033..be025f987020 100644 --- a/datasets/flwr_datasets/utils.py +++ b/datasets/flwr_datasets/utils.py @@ -15,12 +15,14 @@ """Utils for FederatedDataset.""" -from typing import Dict +from typing import Dict, Union, cast from flwr_datasets.partitioner import IidPartitioner, Partitioner -def _instantiate_partitioners(partitioners: Dict[str, int]) -> Dict[str, Partitioner]: +def _instantiate_partitioners( + partitioners: Union[int, Partitioner, Dict[str, int], Dict[str, Partitioner]] +) -> Dict[str, Partitioner]: """Transform the partitioners from the initial format to instantiated objects. Parameters @@ -34,10 +36,27 @@ def _instantiate_partitioners(partitioners: Dict[str, int]) -> Dict[str, Partiti Partitioners specified as split to Partitioner object. 
""" instantiated_partitioners: Dict[str, Partitioner] = {} - for split_name, num_partitions in partitioners.items(): - instantiated_partitioners[split_name] = IidPartitioner( - num_partitions=num_partitions - ) + if isinstance(partitioners, int): + # We assume that the int regards IidPartitioner specified for the train set + instantiated_partitioners["train"] = IidPartitioner(num_partitions=partitioners) + elif isinstance(partitioners, Partitioner): + # We assume that the Partitioner was specified for the train set + instantiated_partitioners["train"] = partitioners + elif isinstance(partitioners, Dict): + # dict_first_value = list(partitioners.values())[0] + # Dict[str, Partitioner] + if all(isinstance(val, Partitioner) for val in partitioners.values()): + # No need to do anything + instantiated_partitioners = cast(Dict[str, Partitioner], partitioners) + # Dict[str, int] + elif all(isinstance(val, int) for val in partitioners.values()): + for split_name, num_partitions in partitioners.items(): + assert isinstance(num_partitions, int) + instantiated_partitioners[split_name] = IidPartitioner( + num_partitions=num_partitions + ) + else: + raise ValueError("Incorrect type of the 'partitioners' encountered.") return instantiated_partitioners From ae10174447605bf7d5f1b1b3a5d64beb3c2db781 Mon Sep 17 00:00:00 2001 From: Adam Narozniak Date: Thu, 12 Oct 2023 11:19:48 +0200 Subject: [PATCH 2/7] Remove the int and Partitioner type --- datasets/flwr_datasets/federated_dataset.py | 6 +-- datasets/flwr_datasets/utils.py | 46 ++++++++++----------- 2 files changed, 25 insertions(+), 27 deletions(-) diff --git a/datasets/flwr_datasets/federated_dataset.py b/datasets/flwr_datasets/federated_dataset.py index 9449ddbe745a..1f98f84f9630 100644 --- a/datasets/flwr_datasets/federated_dataset.py +++ b/datasets/flwr_datasets/federated_dataset.py @@ -35,8 +35,8 @@ class FederatedDataset: ---------- dataset: str The name of the dataset in the Hugging Face Hub. - partitioners: Dict[str, int] - Dataset split to the number of IID partitions. + partitioners: Dict[str, Union[Partitioner, int]] + Dataset split to the Partitioner or a number of IID partitions. Examples -------- @@ -57,7 +57,7 @@ def __init__( self, *, dataset: str, - partitioners: Union[int, Partitioner, Dict[str, int], Dict[str, Partitioner]], + partitioners: Dict[str, Union[Partitioner, int]], ) -> None: _check_if_dataset_tested(dataset) self._dataset_name: str = dataset diff --git a/datasets/flwr_datasets/utils.py b/datasets/flwr_datasets/utils.py index 892b96d43452..c73e9b47a162 100644 --- a/datasets/flwr_datasets/utils.py +++ b/datasets/flwr_datasets/utils.py @@ -16,7 +16,7 @@ import warnings -from typing import Dict, Union, cast +from typing import Dict, Union from flwr_datasets.partitioner import IidPartitioner, Partitioner @@ -30,14 +30,14 @@ def _instantiate_partitioners( - partitioners: Union[int, Partitioner, Dict[str, int], Dict[str, Partitioner]] + partitioners: Dict[str, Union[Partitioner, int]] ) -> Dict[str, Partitioner]: """Transform the partitioners from the initial format to instantiated objects. Parameters ---------- - partitioners: Dict[str, int] - Partitioners specified as split to the number of partitions format. + partitioners: Dict[str, Union[Partitioner, int]] + Dataset split to the Partitioner or a number of IID partitions. Returns ------- @@ -45,27 +45,25 @@ def _instantiate_partitioners( Partitioners specified as split to Partitioner object. 
""" instantiated_partitioners: Dict[str, Partitioner] = {} - if isinstance(partitioners, int): - # We assume that the int regards IidPartitioner specified for the train set - instantiated_partitioners["train"] = IidPartitioner(num_partitions=partitioners) - elif isinstance(partitioners, Partitioner): - # We assume that the Partitioner was specified for the train set - instantiated_partitioners["train"] = partitioners - elif isinstance(partitioners, Dict): - # dict_first_value = list(partitioners.values())[0] - # Dict[str, Partitioner] - if all(isinstance(val, Partitioner) for val in partitioners.values()): - # No need to do anything - instantiated_partitioners = cast(Dict[str, Partitioner], partitioners) - # Dict[str, int] - elif all(isinstance(val, int) for val in partitioners.values()): - for split_name, num_partitions in partitioners.items(): - assert isinstance(num_partitions, int) - instantiated_partitioners[split_name] = IidPartitioner( - num_partitions=num_partitions + if isinstance(partitioners, Dict): + for split, partitioner in partitioners.items(): + if isinstance(partitioner, Partitioner): + instantiated_partitioners[split] = partitioner + elif isinstance(partitioner, int): + instantiated_partitioners[split] = IidPartitioner( + num_partitions=partitioner ) - else: - raise ValueError("Incorrect type of the 'partitioners' encountered.") + else: + raise ValueError( + f"Incorrect type of the 'partitioners' value encountered. " + f"Expected Partitioner or int. Given {type(partitioner)}" + ) + else: + raise ValueError( + f"Incorrect type of the 'partitioners' encountered. " + f"Expected Dict[str, Union[int, Partitioner]]. " + f"Given {type(partitioners)}." + ) return instantiated_partitioners From ca3087b3f817fd61511475dfcfd4987f398f4f24 Mon Sep 17 00:00:00 2001 From: Adam Narozniak Date: Thu, 12 Oct 2023 11:27:19 +0200 Subject: [PATCH 3/7] Add tests for mixed types dict values for partitioners --- .../flwr_datasets/federated_dataset_test.py | 89 ++++++++----------- 1 file changed, 37 insertions(+), 52 deletions(-) diff --git a/datasets/flwr_datasets/federated_dataset_test.py b/datasets/flwr_datasets/federated_dataset_test.py index 5d6aee981a41..2d2fc86c5ed9 100644 --- a/datasets/flwr_datasets/federated_dataset_test.py +++ b/datasets/flwr_datasets/federated_dataset_test.py @@ -13,11 +13,11 @@ # limitations under the License. 
# ============================================================================== """Federated Dataset tests.""" -# pylint: disable=W0212, C0103 +# pylint: disable=W0212, C0103, C0206 import unittest -from typing import Dict +from typing import Dict, Union import pytest from parameterized import parameterized, parameterized_class @@ -100,84 +100,69 @@ class PartitionersSpecificationForFederatedDatasets(unittest.TestCase): dataset_name = "cifar10" test_split = "test" - def test_partitioner_as_int_produces_dict_of_partitioners(self) -> None: - """Test specifying partitioner as an integer creates a dictionary.""" - num_train_partitions = 100 - fds = FederatedDataset( - dataset=self.dataset_name, - partitioners=num_train_partitions, - ) - self.assertIsInstance(fds._partitioners, dict) - - def test_partitioner_as_int_produces_single_partitioner_for_train_data( - self, - ) -> None: - """Test if int partitioner creates a single partitioner for the train set.""" + def test_dict_of_partitioners_passes_partitioners(self) -> None: + """Test if partitioners are passed directly (no recreation).""" num_train_partitions = 100 + num_test_partitions = 100 + partitioners: Dict[str, Union[Partitioner, int]] = { + "train": IidPartitioner(num_partitions=num_train_partitions), + "test": IidPartitioner(num_partitions=num_test_partitions), + } fds = FederatedDataset( dataset=self.dataset_name, - partitioners=num_train_partitions, + partitioners=partitioners, ) - self.assertTrue(len(fds._partitioners) == 1 and "train" in fds._partitioners) - def test_partitioner_as_int_produces_IidPartitioner(self) -> None: - """Test if int partitioner creates IidPartitioner for the training set.""" - num_train_partitions = 100 - fds = FederatedDataset( - dataset=self.dataset_name, - partitioners=num_train_partitions, + self.assertTrue( + all(fds._partitioners[key] == partitioners[key] for key in partitioners) ) - self.assertIsInstance(fds._partitioners["train"], IidPartitioner) - def test_single_partitioner_gets_assigned_to_train_set(self) -> None: - """Test if IidPartitioner gets assigned to the train data set.""" + def test_dict_str_int_produces_correct_partitioners(self) -> None: + """Test if dict partitioners have the same keys.""" num_train_partitions = 100 - iid_partitioner = IidPartitioner(num_partitions=num_train_partitions) + num_test_partitions = 100 fds = FederatedDataset( dataset=self.dataset_name, - partitioners=iid_partitioner, + partitioners={ + "train": num_train_partitions, + "test": num_test_partitions, + }, ) - self.assertTrue(len(fds._partitioners) == 1 and "train" in fds._partitioners) - - def test_single_partitioner_given_is_the_same_object(self) -> None: - """Test if single partitioner is passed directly.""" - num_train_partitions = 100 - iid_partitioner = IidPartitioner(num_partitions=num_train_partitions) - fds = FederatedDataset( - dataset=self.dataset_name, - partitioners=iid_partitioner, + self.assertTrue( + len(fds._partitioners) == 2 + and "train" in fds._partitioners + and "test" in fds._partitioners ) - self.assertIs(fds._partitioners["train"], iid_partitioner) - def test_dict_of_partitioners_given_is_the_same_in_FederatedDataset(self) -> None: - """Test if a dictionary of partitioners is passed directly.""" + def test_mixed_type_partitioners_passes_instantiated_partitioners(self) -> None: + """Test if an instantiated partitioner is passed directly.""" num_train_partitions = 100 num_test_partitions = 100 - partitioners: Dict[str, Partitioner] = { + partitioners: Dict[str, Union[Partitioner, int]] = 
{ "train": IidPartitioner(num_partitions=num_train_partitions), - "test": IidPartitioner(num_partitions=num_test_partitions), + "test": num_test_partitions, } fds = FederatedDataset( dataset=self.dataset_name, partitioners=partitioners, ) - self.assertIs(fds._partitioners, partitioners) + self.assertIs(fds._partitioners["train"], partitioners["train"]) - def test_dict_str_int_produces_correct_partitioners(self) -> None: - """Test if dict partitioners have the same keys.""" + def test_mixed_type_partitioners_creates_from_int(self) -> None: + """Test if an IidPartitioner partitioner is created.""" num_train_partitions = 100 num_test_partitions = 100 + partitioners: Dict[str, Union[Partitioner, int]] = { + "train": IidPartitioner(num_partitions=num_train_partitions), + "test": num_test_partitions, + } fds = FederatedDataset( dataset=self.dataset_name, - partitioners={ - "train": num_train_partitions, - "test": num_test_partitions, - }, + partitioners=partitioners, ) self.assertTrue( - len(fds._partitioners) == 2 - and "train" in fds._partitioners - and "test" in fds._partitioners + isinstance(fds._partitioners["test"], IidPartitioner) + and fds._partitioners["test"]._num_partitions == num_test_partitions ) From 867ae5dd725235ab126a2f8d1251fd378f3c7345 Mon Sep 17 00:00:00 2001 From: Adam Narozniak <51029327+adam-narozniak@users.noreply.github.com> Date: Thu, 12 Oct 2023 13:12:33 +0200 Subject: [PATCH 4/7] Update description of the partitioner parameter Co-authored-by: Daniel J. Beutel --- datasets/flwr_datasets/federated_dataset.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/datasets/flwr_datasets/federated_dataset.py b/datasets/flwr_datasets/federated_dataset.py index 1f98f84f9630..6bf0d65d3829 100644 --- a/datasets/flwr_datasets/federated_dataset.py +++ b/datasets/flwr_datasets/federated_dataset.py @@ -36,7 +36,7 @@ class FederatedDataset: dataset: str The name of the dataset in the Hugging Face Hub. partitioners: Dict[str, Union[Partitioner, int]] - Dataset split to the Partitioner or a number of IID partitions. + A dictionary mapping the Dataset split (a `str`) to a `Partitioner` or an `int` (representing the number of IID partitions that this split should be partitioned into). Examples -------- From 094ca8d073d9db109ecab31f725d228f8af7f750 Mon Sep 17 00:00:00 2001 From: Adam Narozniak <51029327+adam-narozniak@users.noreply.github.com> Date: Thu, 12 Oct 2023 13:13:34 +0200 Subject: [PATCH 5/7] Specify FederatedDatasets as `FederatedDataset` Co-authored-by: Daniel J. Beutel --- datasets/flwr_datasets/federated_dataset_test.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/datasets/flwr_datasets/federated_dataset_test.py b/datasets/flwr_datasets/federated_dataset_test.py index 2d2fc86c5ed9..15485dfa7a95 100644 --- a/datasets/flwr_datasets/federated_dataset_test.py +++ b/datasets/flwr_datasets/federated_dataset_test.py @@ -95,7 +95,7 @@ def test_multiple_partitioners(self) -> None: class PartitionersSpecificationForFederatedDatasets(unittest.TestCase): - """Test the specifications of partitioners for FederatedDatasets.""" + """Test the specifications of partitioners for `FederatedDataset`.""" dataset_name = "cifar10" test_split = "test" From 67fb89bbb1090fabf28700e535fb62019a86459d Mon Sep 17 00:00:00 2001 From: Adam Narozniak <51029327+adam-narozniak@users.noreply.github.com> Date: Thu, 12 Oct 2023 13:13:55 +0200 Subject: [PATCH 6/7] Fix spacing Co-authored-by: Daniel J. 
Beutel --- datasets/flwr_datasets/utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/datasets/flwr_datasets/utils.py b/datasets/flwr_datasets/utils.py index c73e9b47a162..056a12442d75 100644 --- a/datasets/flwr_datasets/utils.py +++ b/datasets/flwr_datasets/utils.py @@ -36,7 +36,7 @@ def _instantiate_partitioners( Parameters ---------- - partitioners: Dict[str, Union[Partitioner, int]] + partitioners: Dict[str, Union[Partitioner, int]] Dataset split to the Partitioner or a number of IID partitions. Returns From fde9e9f19690689e2f8c218d137dd46e0f59ad7d Mon Sep 17 00:00:00 2001 From: Adam Narozniak Date: Thu, 12 Oct 2023 13:15:40 +0200 Subject: [PATCH 7/7] Fix line lengths --- datasets/flwr_datasets/federated_dataset.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/datasets/flwr_datasets/federated_dataset.py b/datasets/flwr_datasets/federated_dataset.py index 6bf0d65d3829..8d171db2afa4 100644 --- a/datasets/flwr_datasets/federated_dataset.py +++ b/datasets/flwr_datasets/federated_dataset.py @@ -36,7 +36,9 @@ class FederatedDataset: dataset: str The name of the dataset in the Hugging Face Hub. partitioners: Dict[str, Union[Partitioner, int]] - A dictionary mapping the Dataset split (a `str`) to a `Partitioner` or an `int` (representing the number of IID partitions that this split should be partitioned into). + A dictionary mapping the Dataset split (a `str`) to a `Partitioner` or an `int` + (representing the number of IID partitions that this split should be partitioned + into). Examples --------
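
A minimal usage sketch of the `partitioners` argument as it stands after this patch series (illustrative only, not part of the diffs above): the argument is a dictionary mapping each split name to either a `Partitioner` instance, which is used as-is, or an `int`, which is turned into an `IidPartitioner` with that many partitions. The dataset name and partition counts below are placeholders, and the `load_partition` call is an assumption that mirrors the doctest examples in the class docstring; only `load_full` is shown verbatim in the diffs above.

>>> from flwr_datasets.federated_dataset import FederatedDataset
>>> from flwr_datasets.partitioner import IidPartitioner
>>>
>>> fds = FederatedDataset(
...     dataset="cifar10",
...     partitioners={
...         "train": IidPartitioner(num_partitions=100),  # Partitioner instance, passed through unchanged
...         "test": 10,  # int, instantiated as IidPartitioner(num_partitions=10)
...     },
... )
>>> partition = fds.load_partition(0, "train")  # assumed helper, patterned on the docstring examples
>>> centralized = fds.load_full("test")

The series first accepted a bare `int` or `Partitioner` (implicitly assigned to the "train" split) and then settled on the dictionary-only form, so the target split is always stated explicitly.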