From dd02021739ce7ab51702379790a6780123db62e7 Mon Sep 17 00:00:00 2001 From: Adam Narozniak Date: Mon, 23 Oct 2023 16:40:25 +0200 Subject: [PATCH 01/17] Add size partitioner --- .../partitioner/size_partitioner.py | 135 ++++++++++++++++++ 1 file changed, 135 insertions(+) create mode 100644 datasets/flwr_datasets/partitioner/size_partitioner.py diff --git a/datasets/flwr_datasets/partitioner/size_partitioner.py b/datasets/flwr_datasets/partitioner/size_partitioner.py new file mode 100644 index 000000000000..e471a6836c7e --- /dev/null +++ b/datasets/flwr_datasets/partitioner/size_partitioner.py @@ -0,0 +1,135 @@ +# Copyright 2023 Flower Labs GmbH. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== +"""DataQuantityPartitioner class.""" +from typing import Callable, Dict, List, Union + +import numpy as np + +import datasets +from flwr_datasets.partitioner.partitioner import Partitioner + + +class SizePartitioner(Partitioner): + """Base class for the deterministic size partitioning schemes based on the cid. + + The client with cid has the following relationship regarding the number of samples. + cid_to_size_fnc(cid) ~ number of samples for cid + + If the function doesn't transform the cid it's a linear correlations of the number + of sample for cid and the value of cid. 
For instance, if the IDs range from 1 to M, + client with ID 1 gets 1 unit of data, client 2 gets 2 units, and so on, up to client + M which gets M units. + + Note that size corresponding to the cid is deterministic, yet in case of different + dataset shuffling the assignment of samples to cid will vary. + + Parameters + ---------- + num_partitions: int + The total number of partitions that the data will be divided into. + cid_to_size_fnc: Callable + Function that defines the relationship between cid and the number of samples. + """ + + def __init__( + self, + num_partitions: int, + cid_to_size_fnc: Callable, # type: ignore[type-arg] + ) -> None: + super().__init__() + if num_partitions <= 0: + raise ValueError("The number of partitions must be greater than zero.") + self._num_partitions = num_partitions + self._cid_to_size_fnc = cid_to_size_fnc + + self._cid_to_size: Dict[int, int] = {} + self._cid_to_indices: Dict[int, List[int]] = {} + # A flag to perform only a single compute to determine the indices + self._cid_to_indices_determined = False + + def load_partition(self, idx: int) -> datasets.Dataset: + """Load a single partition based on the partition index. + + For this partitioner the number of samples is dependent on the partition idx. + + Parameters + ---------- + idx: int + the index that corresponds to the requested partition + + Returns + ------- + dataset_partition: Dataset + single dataset partition + """ + # The partitioning is done lazily - only when the first partition is requested. + # A single run creates the indices assignments for all the partition indices. 
+ self._determine_cid_to_indices_if_needed() + return self.dataset.select(self._cid_to_indices[idx]) + + @property + def cid_to_size(self) -> Dict[int, int]: + """Cid to the number of samples.""" + return self._cid_to_size + + @property + def cid_to_indices(self) -> Dict[int, List[int]]: + """Cid to the list of indices.""" + return self._cid_to_indices + + def _determine_cid_to_size(self) -> None: + """Determine data quantity associated with partition indices.""" + data_division_in_units = self._cid_to_size_fnc( + np.linspace(start=1, stop=self._num_partitions, num=self._num_partitions) + ) + total_units: Union[int, float] = data_division_in_units.sum() + # Normalize the units to get the fraction total dataset + partition_sizes_as_fraction = data_division_in_units / total_units + # Calculate the number of samples + partition_sizes_as_num_of_samples = np.array( + partition_sizes_as_fraction * len(self.dataset), dtype=np.int64 + ) + # Check if any sample is not allocated because of multiplication with fractions. + assigned_samples = np.sum(partition_sizes_as_num_of_samples) + left_unassigned_samples = len(self.dataset) - assigned_samples + # If there is any sample(s) left unassigned, assign it to the largest partition. 
+ partition_sizes_as_num_of_samples[-1] += left_unassigned_samples + print(partition_sizes_as_num_of_samples) + for idx, partition_size in enumerate(partition_sizes_as_num_of_samples): + self._cid_to_size[idx] = partition_size + + self._check_if_cid_to_size_possible() + + def _determine_cid_to_indices_if_needed(self) -> None: + """Create an assignment of indices to the partition indices..""" + if self._cid_to_indices_determined is True: + return + self._determine_cid_to_size() + total_samples_assigned = 0 + for idx, quantity in self._cid_to_size.items(): + self._cid_to_indices[idx] = list( + range(total_samples_assigned, total_samples_assigned + quantity) + ) + total_samples_assigned += quantity + self._cid_to_indices_determined = True + + def _check_if_cid_to_size_possible(self) -> None: + all_positive = all(value > 0 for value in self.cid_to_size.values()) + if not all_positive: + raise ValueError( + f"The given specification of the parameter num_partitions" + f"={self._num_partitions} for the given dataset results " + f"in the partitions sizes that are not greater than 0." + ) From 60f47fc78615a94e599dbab3ac2c80385efefcf6 Mon Sep 17 00:00:00 2001 From: Adam Narozniak Date: Mon, 23 Oct 2023 16:40:31 +0200 Subject: [PATCH 02/17] Add size partitioner tests --- .../partitioner/size_partitioner_test.py | 100 ++++++++++++++++++ 1 file changed, 100 insertions(+) create mode 100644 datasets/flwr_datasets/partitioner/size_partitioner_test.py diff --git a/datasets/flwr_datasets/partitioner/size_partitioner_test.py b/datasets/flwr_datasets/partitioner/size_partitioner_test.py new file mode 100644 index 000000000000..c5344cc01582 --- /dev/null +++ b/datasets/flwr_datasets/partitioner/size_partitioner_test.py @@ -0,0 +1,100 @@ +# Copyright 2023 Flower Labs GmbH. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== +"""SizePartitioner tests.""" +import unittest + +from parameterized import parameterized + +from datasets import Dataset +from flwr_datasets.partitioner.linear_partitioner import LinearPartitioner + + +def _dummy_dataset(num_rows: int) -> Dataset: + data = { + "features": list(range(num_rows)), + "labels": [i % 2 for i in range(num_rows)], + } + dataset = Dataset.from_dict(data) + return dataset + + +class TestLinearPartitioner(unittest.TestCase): + """Test LinearPartitioner.""" + + @parameterized.expand( # type: ignore + [ + (1, 100), + (10, 100), + (5, 55), # This will leave some undivided samples + ] + ) + def test_linear_distribution(self, num_partitions: int, num_rows: int) -> None: + """Test the linear distribution of samples.""" + dataset = _dummy_dataset(num_rows) + partitioner = LinearPartitioner(num_partitions=num_partitions) + partitioner.dataset = dataset + # Run a single partition loading to trigger the division + _ = partitioner.load_partition(0) + total_samples = sum(partitioner.cid_to_size.values()) + self.assertEqual(total_samples, num_rows) + + # Testing if each partition is getting more than the previous one + last_count = 0 + for i in range(num_partitions): + current_count = partitioner.cid_to_size[i] + self.assertGreaterEqual(current_count, last_count) + last_count = current_count + + @parameterized.expand( # type: ignore + [ + (10, 100), + (5, 55), # This will leave some undivided samples + (7, 77), # This will leave some undivided samples + ] + ) + def 
test_undivided_samples(self, num_partitions: int, num_rows: int) -> None: + """Test the logic for distributing undivided samples.""" + dataset = _dummy_dataset(num_rows) + partitioner = LinearPartitioner(num_partitions=num_partitions) + partitioner.dataset = dataset + # If there are any undivided samples, they should be added to the largest + # partition + last_partition_id = num_partitions - 1 + actual_samples_in_last_partition = len( + partitioner.load_partition(last_partition_id) + ) + expected_samples_in_last_partition = partitioner.cid_to_size[last_partition_id] + self.assertEqual( + expected_samples_in_last_partition, actual_samples_in_last_partition + ) + + def test_meaningless_params(self) -> None: + """Test if the params leading to partition size not greater than zero raises.""" + num_rows = 10 + num_partitions = 100 + dataset = _dummy_dataset(num_rows) + partitioner = LinearPartitioner(num_partitions=num_partitions) + partitioner.dataset = dataset + with self.assertRaises(ValueError) as context: + partitioner.load_partition(1) + self.assertIn( + "The given specification of the parameter num_partitions=10 for the given " + "dataset results in the partitions sizes that are not greater than 0.", + str(context.exception), + ) + + +if __name__ == "__main__": + unittest.main() From c595d2cfdfc4e055777efe3702b08c0c7744e48f Mon Sep 17 00:00:00 2001 From: Adam Narozniak Date: Mon, 23 Oct 2023 16:40:57 +0200 Subject: [PATCH 03/17] Add linear partitioner --- .../partitioner/linear_partitioner.py | 36 +++++++++++++++++++ 1 file changed, 36 insertions(+) create mode 100644 datasets/flwr_datasets/partitioner/linear_partitioner.py diff --git a/datasets/flwr_datasets/partitioner/linear_partitioner.py b/datasets/flwr_datasets/partitioner/linear_partitioner.py new file mode 100644 index 000000000000..a2543c72ab27 --- /dev/null +++ b/datasets/flwr_datasets/partitioner/linear_partitioner.py @@ -0,0 +1,36 @@ +# Copyright 2023 Flower Labs GmbH. All Rights Reserved. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== +"""LinearPartitioner class.""" + +from flwr_datasets.partitioner.size_partitioner import SizePartitioner + + +class LinearPartitioner(SizePartitioner): + """Partitioner creates partitions of size that are linearly correlated with its id. + + The amount of data each client gets is linearly correlated with the client's ID. + For instance, if the IDs range from 1 to M, client with ID 1 gets 1 unit of data, + client 2 gets 2 units, and so on, up to client M which gets M units. + + Parameters + ---------- + num_partitions: int + The total number of partitions that the data will be divided into. 
+ """ + + def __init__(self, num_partitions: int) -> None: + super().__init__(num_partitions=num_partitions, cid_to_size_fnc=lambda x: x) + if num_partitions <= 0: + raise ValueError("The number of partitions must be greater than zero.") From 0c5ee2767cc8136f05dcf4e7cc96c47c6c81b377 Mon Sep 17 00:00:00 2001 From: Adam Narozniak Date: Mon, 23 Oct 2023 16:41:04 +0200 Subject: [PATCH 04/17] Add square partitioner --- .../partitioner/square_partitioner.py | 37 +++++++++++++++++++ 1 file changed, 37 insertions(+) create mode 100644 datasets/flwr_datasets/partitioner/square_partitioner.py diff --git a/datasets/flwr_datasets/partitioner/square_partitioner.py b/datasets/flwr_datasets/partitioner/square_partitioner.py new file mode 100644 index 000000000000..424b20702db9 --- /dev/null +++ b/datasets/flwr_datasets/partitioner/square_partitioner.py @@ -0,0 +1,37 @@ +# Copyright 2023 Flower Labs GmbH. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== +"""SquarePartitioner class.""" +import numpy as np + +from flwr_datasets.partitioner.size_partitioner import SizePartitioner + + +class SquarePartitioner(SizePartitioner): + """Partitioner creates partitions of size that are correlated with squared ids. + + The amount of data each client gets is correlated with the squared client's ID. 
+ For instance, if the IDs range from 1 to M, client with ID 1 gets 1 unit of data, + client 2 gets 4 units, and so on, up to client M which gets M^2 units. + + Parameters + ---------- + num_partitions: int + The total number of partitions that the data will be divided into. + """ + + def __init__(self, num_partitions: int) -> None: + super().__init__(num_partitions=num_partitions, cid_to_size_fnc=np.square) + if num_partitions <= 0: + raise ValueError("The number of partitions must be greater than zero.") From 3e05d5ccac7215a3bcbd379d4a54e6c11297c672 Mon Sep 17 00:00:00 2001 From: Adam Narozniak Date: Mon, 23 Oct 2023 16:41:17 +0200 Subject: [PATCH 05/17] Add exponential partitioner --- .../partitioner/exponential_partitioner.py | 37 +++++++++++++++++++ 1 file changed, 37 insertions(+) create mode 100644 datasets/flwr_datasets/partitioner/exponential_partitioner.py diff --git a/datasets/flwr_datasets/partitioner/exponential_partitioner.py b/datasets/flwr_datasets/partitioner/exponential_partitioner.py new file mode 100644 index 000000000000..4d278f83f7da --- /dev/null +++ b/datasets/flwr_datasets/partitioner/exponential_partitioner.py @@ -0,0 +1,37 @@ +# Copyright 2023 Flower Labs GmbH. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# ============================================================================== +"""ExponentialPartitioner class.""" +import numpy as np + +from flwr_datasets.partitioner.size_partitioner import SizePartitioner + + +class ExponentialPartitioner(SizePartitioner): + """Partitioner creates partitions of size that are correlated with exp(ids). + + The amount of data each client gets is correlated with the squared client's ID. + For instance, if the IDs range from 1 to M, client with ID 1 gets e units of data, + client 2 gets e^2 units, and so on, up to client M which gets e^M units. + + Parameters + ---------- + num_partitions: int + The total number of partitions that the data will be divided into. + """ + + def __init__(self, num_partitions: int) -> None: + super().__init__(num_partitions=num_partitions, cid_to_size_fnc=np.exp) + if num_partitions <= 0: + raise ValueError("The number of partitions must be greater than zero.") From a37529ae3e95dbe4a3477af6ca134763bb41fc6b Mon Sep 17 00:00:00 2001 From: Adam Narozniak Date: Mon, 23 Oct 2023 16:47:22 +0200 Subject: [PATCH 06/17] Remove print --- datasets/flwr_datasets/partitioner/size_partitioner.py | 1 - 1 file changed, 1 deletion(-) diff --git a/datasets/flwr_datasets/partitioner/size_partitioner.py b/datasets/flwr_datasets/partitioner/size_partitioner.py index e471a6836c7e..2c5d51122e7e 100644 --- a/datasets/flwr_datasets/partitioner/size_partitioner.py +++ b/datasets/flwr_datasets/partitioner/size_partitioner.py @@ -106,7 +106,6 @@ def _determine_cid_to_size(self) -> None: left_unassigned_samples = len(self.dataset) - assigned_samples # If there is any sample(s) left unassigned, assign it to the largest partition. 
partition_sizes_as_num_of_samples[-1] += left_unassigned_samples - print(partition_sizes_as_num_of_samples) for idx, partition_size in enumerate(partition_sizes_as_num_of_samples): self._cid_to_size[idx] = partition_size From c393c1f0725912006669ca7ac91a1f5d8229f12a Mon Sep 17 00:00:00 2001 From: Adam Narozniak Date: Mon, 23 Oct 2023 17:03:15 +0200 Subject: [PATCH 07/17] Fix tests --- datasets/flwr_datasets/partitioner/size_partitioner_test.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/datasets/flwr_datasets/partitioner/size_partitioner_test.py b/datasets/flwr_datasets/partitioner/size_partitioner_test.py index c5344cc01582..fa23fe18482f 100644 --- a/datasets/flwr_datasets/partitioner/size_partitioner_test.py +++ b/datasets/flwr_datasets/partitioner/size_partitioner_test.py @@ -90,7 +90,7 @@ def test_meaningless_params(self) -> None: with self.assertRaises(ValueError) as context: partitioner.load_partition(1) self.assertIn( - "The given specification of the parameter num_partitions=10 for the given " + "The given specification of the parameter num_partitions=100 for the given " "dataset results in the partitions sizes that are not greater than 0.", str(context.exception), ) From 614c6fc304a330cddedc9df3b410f99dfe4348d9 Mon Sep 17 00:00:00 2001 From: Adam Narozniak Date: Tue, 24 Oct 2023 15:39:45 +0200 Subject: [PATCH 08/17] Add new partitioners to init for easy imports --- datasets/flwr_datasets/partitioner/__init__.py | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/datasets/flwr_datasets/partitioner/__init__.py b/datasets/flwr_datasets/partitioner/__init__.py index d1eba7895702..83186a6d24ee 100644 --- a/datasets/flwr_datasets/partitioner/__init__.py +++ b/datasets/flwr_datasets/partitioner/__init__.py @@ -15,10 +15,18 @@ """Flower Datasets Partitioner package.""" +from .exponential_partitioner import ExponentialPartitioner from .iid_partitioner import IidPartitioner +from .linear_partitioner import LinearPartitioner from 
.partitioner import Partitioner +from .size_partitioner import SizePartitioner +from .square_partitioner import SquarePartitioner __all__ = [ "IidPartitioner", "Partitioner", + "SizePartitioner", + "LinearPartitioner", + "SquarePartitioner", + "ExponentialPartitioner", ] From 3038081c001295daa0b16a06fd347f10b8166c9d Mon Sep 17 00:00:00 2001 From: yan-gao-GY Date: Thu, 26 Oct 2023 20:15:15 +0100 Subject: [PATCH 09/17] Add jxie/higgs as tested_datasets --- datasets/flwr_datasets/utils.py | 1 + 1 file changed, 1 insertion(+) diff --git a/datasets/flwr_datasets/utils.py b/datasets/flwr_datasets/utils.py index 7badb1445460..c3409f3f0454 100644 --- a/datasets/flwr_datasets/utils.py +++ b/datasets/flwr_datasets/utils.py @@ -28,6 +28,7 @@ "fashion_mnist", "sasha/dog-food", "zh-plus/tiny-imagenet", + "jxie/higgs", ] From d0f2cfc78239f6fc0783be6cf7023fa9b476d4c4 Mon Sep 17 00:00:00 2001 From: Adam Narozniak Date: Tue, 7 Nov 2023 10:22:56 +0100 Subject: [PATCH 10/17] Remove jxie/higgs from tested_datasets list --- datasets/flwr_datasets/utils.py | 1 - 1 file changed, 1 deletion(-) diff --git a/datasets/flwr_datasets/utils.py b/datasets/flwr_datasets/utils.py index c3409f3f0454..7badb1445460 100644 --- a/datasets/flwr_datasets/utils.py +++ b/datasets/flwr_datasets/utils.py @@ -28,7 +28,6 @@ "fashion_mnist", "sasha/dog-food", "zh-plus/tiny-imagenet", - "jxie/higgs", ] From 05bba2d4aca4dcd24ac330a31a3bc211db55d8f7 Mon Sep 17 00:00:00 2001 From: Adam Narozniak Date: Tue, 7 Nov 2023 10:26:16 +0100 Subject: [PATCH 11/17] Clarify _check_if_cid_to_size_possible by using >= 1 instead >0 --- datasets/flwr_datasets/partitioner/size_partitioner.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/datasets/flwr_datasets/partitioner/size_partitioner.py b/datasets/flwr_datasets/partitioner/size_partitioner.py index 2c5d51122e7e..0925f61b4a23 100644 --- a/datasets/flwr_datasets/partitioner/size_partitioner.py +++ b/datasets/flwr_datasets/partitioner/size_partitioner.py @@ 
-125,7 +125,7 @@ def _determine_cid_to_indices_if_needed(self) -> None: self._cid_to_indices_determined = True def _check_if_cid_to_size_possible(self) -> None: - all_positive = all(value > 0 for value in self.cid_to_size.values()) + all_positive = all(value >= 1 for value in self.cid_to_size.values()) if not all_positive: raise ValueError( f"The given specification of the parameter num_partitions" From b5d456ec2713b8c1347e7f00a8594bf7d9a07a08 Mon Sep 17 00:00:00 2001 From: Adam Narozniak Date: Tue, 7 Nov 2023 10:50:16 +0100 Subject: [PATCH 12/17] Rename cid to id (in each method) --- .../partitioner/size_partitioner.py | 18 +++++++++--------- .../partitioner/size_partitioner_test.py | 6 +++--- 2 files changed, 12 insertions(+), 12 deletions(-) diff --git a/datasets/flwr_datasets/partitioner/size_partitioner.py b/datasets/flwr_datasets/partitioner/size_partitioner.py index 0925f61b4a23..575dfa9d8ce4 100644 --- a/datasets/flwr_datasets/partitioner/size_partitioner.py +++ b/datasets/flwr_datasets/partitioner/size_partitioner.py @@ -76,20 +76,20 @@ def load_partition(self, idx: int) -> datasets.Dataset: """ # The partitioning is done lazily - only when the first partition is requested. # A single run creates the indices assignments for all the partition indices. 
- self._determine_cid_to_indices_if_needed() + self._determine_id_to_indices_if_needed() return self.dataset.select(self._cid_to_indices[idx]) @property - def cid_to_size(self) -> Dict[int, int]: + def id_to_size(self) -> Dict[int, int]: """Cid to the number of samples.""" return self._cid_to_size @property - def cid_to_indices(self) -> Dict[int, List[int]]: + def id_to_indices(self) -> Dict[int, List[int]]: """Cid to the list of indices.""" return self._cid_to_indices - def _determine_cid_to_size(self) -> None: + def _determine_id_to_size(self) -> None: """Determine data quantity associated with partition indices.""" data_division_in_units = self._cid_to_size_fnc( np.linspace(start=1, stop=self._num_partitions, num=self._num_partitions) @@ -109,13 +109,13 @@ def _determine_cid_to_size(self) -> None: for idx, partition_size in enumerate(partition_sizes_as_num_of_samples): self._cid_to_size[idx] = partition_size - self._check_if_cid_to_size_possible() + self._check_if_id_to_size_possible() - def _determine_cid_to_indices_if_needed(self) -> None: + def _determine_id_to_indices_if_needed(self) -> None: """Create an assignment of indices to the partition indices..""" if self._cid_to_indices_determined is True: return - self._determine_cid_to_size() + self._determine_id_to_size() total_samples_assigned = 0 for idx, quantity in self._cid_to_size.items(): self._cid_to_indices[idx] = list( @@ -124,8 +124,8 @@ def _determine_cid_to_indices_if_needed(self) -> None: total_samples_assigned += quantity self._cid_to_indices_determined = True - def _check_if_cid_to_size_possible(self) -> None: - all_positive = all(value >= 1 for value in self.cid_to_size.values()) + def _check_if_id_to_size_possible(self) -> None: + all_positive = all(value >= 1 for value in self.id_to_size.values()) if not all_positive: raise ValueError( f"The given specification of the parameter num_partitions" diff --git a/datasets/flwr_datasets/partitioner/size_partitioner_test.py 
b/datasets/flwr_datasets/partitioner/size_partitioner_test.py index fa23fe18482f..6bcdb59e7cd6 100644 --- a/datasets/flwr_datasets/partitioner/size_partitioner_test.py +++ b/datasets/flwr_datasets/partitioner/size_partitioner_test.py @@ -47,13 +47,13 @@ def test_linear_distribution(self, num_partitions: int, num_rows: int) -> None: partitioner.dataset = dataset # Run a single partition loading to trigger the division _ = partitioner.load_partition(0) - total_samples = sum(partitioner.cid_to_size.values()) + total_samples = sum(partitioner.id_to_size.values()) self.assertEqual(total_samples, num_rows) # Testing if each partition is getting more than the previous one last_count = 0 for i in range(num_partitions): - current_count = partitioner.cid_to_size[i] + current_count = partitioner.id_to_size[i] self.assertGreaterEqual(current_count, last_count) last_count = current_count @@ -75,7 +75,7 @@ def test_undivided_samples(self, num_partitions: int, num_rows: int) -> None: actual_samples_in_last_partition = len( partitioner.load_partition(last_partition_id) ) - expected_samples_in_last_partition = partitioner.cid_to_size[last_partition_id] + expected_samples_in_last_partition = partitioner.id_to_size[last_partition_id] self.assertEqual( expected_samples_in_last_partition, actual_samples_in_last_partition ) From 55691d91e478654415c43b3a5c7b1183672387e4 Mon Sep 17 00:00:00 2001 From: Adam Narozniak Date: Tue, 7 Nov 2023 11:01:29 +0100 Subject: [PATCH 13/17] Clarify documentation --- .../partitioner/exponential_partitioner.py | 12 ++++++++---- .../flwr_datasets/partitioner/linear_partitioner.py | 4 ++-- .../flwr_datasets/partitioner/square_partitioner.py | 4 ++-- 3 files changed, 12 insertions(+), 8 deletions(-) diff --git a/datasets/flwr_datasets/partitioner/exponential_partitioner.py b/datasets/flwr_datasets/partitioner/exponential_partitioner.py index 4d278f83f7da..7e2bd85bfdf6 100644 --- a/datasets/flwr_datasets/partitioner/exponential_partitioner.py +++ 
b/datasets/flwr_datasets/partitioner/exponential_partitioner.py @@ -19,11 +19,15 @@ class ExponentialPartitioner(SizePartitioner): - """Partitioner creates partitions of size that are correlated with exp(ids). + """Partitioner creates partitions of size that are correlated with exp(idx). - The amount of data each client gets is correlated with the squared client's ID. - For instance, if the IDs range from 1 to M, client with ID 1 gets e units of data, - client 2 gets e^2 units, and so on, up to client M which gets e^M units. + The amount of data each client gets is correlated with the exponent of partition ID. + For instance, if the IDs range from 1 to M, client with ID 1 gets e units of + data, client 2 gets e^2 units, and so on, up to client M which gets e^M units. + The floor operation is applied on each of these numbers, it means floor(2.71...) + = 2; e^2 ~ 7.39 floor(7.39) = 7. The number is rounded down = the fraction is + always cut. The remainders of theses unassigned (fraction) samples is added to the + biggest partition (the one with the biggest idx). Parameters ---------- diff --git a/datasets/flwr_datasets/partitioner/linear_partitioner.py b/datasets/flwr_datasets/partitioner/linear_partitioner.py index a2543c72ab27..a8e7e7790d61 100644 --- a/datasets/flwr_datasets/partitioner/linear_partitioner.py +++ b/datasets/flwr_datasets/partitioner/linear_partitioner.py @@ -18,9 +18,9 @@ class LinearPartitioner(SizePartitioner): - """Partitioner creates partitions of size that are linearly correlated with its id. + """Partitioner creates partitions of size that are linearly correlated with idx. - The amount of data each client gets is linearly correlated with the client's ID. + The amount of data each client gets is linearly correlated with the partition ID. For instance, if the IDs range from 1 to M, client with ID 1 gets 1 unit of data, client 2 gets 2 units, and so on, up to client M which gets M units. 
diff --git a/datasets/flwr_datasets/partitioner/square_partitioner.py b/datasets/flwr_datasets/partitioner/square_partitioner.py index 424b20702db9..8b683be4c589 100644 --- a/datasets/flwr_datasets/partitioner/square_partitioner.py +++ b/datasets/flwr_datasets/partitioner/square_partitioner.py @@ -19,9 +19,9 @@ class SquarePartitioner(SizePartitioner): - """Partitioner creates partitions of size that are correlated with squared ids. + """Partitioner creates partitions of size that are correlated with squared idx. - The amount of data each client gets is correlated with the squared client's ID. + The amount of data each client gets is correlated with the squared partition ID. For instance, if the IDs range from 1 to M, client with ID 1 gets 1 unit of data, client 2 gets 4 units, and so on, up to client M which gets M^2 units. From 32c711722b6716de4372d062664a91b1f69debf2 Mon Sep 17 00:00:00 2001 From: Adam Narozniak Date: Tue, 7 Nov 2023 12:33:03 +0100 Subject: [PATCH 14/17] Apply review suggestion --- .../partitioner/exponential_partitioner.py | 6 ++++-- .../partitioner/linear_partitioner.py | 5 +++-- .../partitioner/size_partitioner.py | 16 +++++++++------- .../partitioner/size_partitioner_test.py | 2 ++ .../partitioner/square_partitioner.py | 6 ++++-- 5 files changed, 22 insertions(+), 13 deletions(-) diff --git a/datasets/flwr_datasets/partitioner/exponential_partitioner.py b/datasets/flwr_datasets/partitioner/exponential_partitioner.py index 7e2bd85bfdf6..48ac59dbe42a 100644 --- a/datasets/flwr_datasets/partitioner/exponential_partitioner.py +++ b/datasets/flwr_datasets/partitioner/exponential_partitioner.py @@ -13,6 +13,8 @@ # limitations under the License. 
# ============================================================================== """ExponentialPartitioner class.""" + + import numpy as np from flwr_datasets.partitioner.size_partitioner import SizePartitioner @@ -31,11 +33,11 @@ class ExponentialPartitioner(SizePartitioner): Parameters ---------- - num_partitions: int + num_partitions : int The total number of partitions that the data will be divided into. """ def __init__(self, num_partitions: int) -> None: - super().__init__(num_partitions=num_partitions, cid_to_size_fnc=np.exp) + super().__init__(num_partitions=num_partitions, cid_to_size_fn=np.exp) if num_partitions <= 0: raise ValueError("The number of partitions must be greater than zero.") diff --git a/datasets/flwr_datasets/partitioner/linear_partitioner.py b/datasets/flwr_datasets/partitioner/linear_partitioner.py index a8e7e7790d61..9af8cfb137ca 100644 --- a/datasets/flwr_datasets/partitioner/linear_partitioner.py +++ b/datasets/flwr_datasets/partitioner/linear_partitioner.py @@ -14,6 +14,7 @@ # ============================================================================== """LinearPartitioner class.""" + from flwr_datasets.partitioner.size_partitioner import SizePartitioner @@ -26,11 +27,11 @@ class LinearPartitioner(SizePartitioner): Parameters ---------- - num_partitions: int + num_partitions : int The total number of partitions that the data will be divided into. 
""" def __init__(self, num_partitions: int) -> None: - super().__init__(num_partitions=num_partitions, cid_to_size_fnc=lambda x: x) + super().__init__(num_partitions=num_partitions, cid_to_size_fn=lambda x: x) if num_partitions <= 0: raise ValueError("The number of partitions must be greater than zero.") diff --git a/datasets/flwr_datasets/partitioner/size_partitioner.py b/datasets/flwr_datasets/partitioner/size_partitioner.py index 575dfa9d8ce4..46eea2907c9b 100644 --- a/datasets/flwr_datasets/partitioner/size_partitioner.py +++ b/datasets/flwr_datasets/partitioner/size_partitioner.py @@ -12,7 +12,9 @@ # See the License for the specific language governing permissions and # limitations under the License. # ============================================================================== -"""DataQuantityPartitioner class.""" +"""SizePartitioner class.""" + + from typing import Callable, Dict, List, Union import numpy as np @@ -37,22 +39,22 @@ class SizePartitioner(Partitioner): Parameters ---------- - num_partitions: int + num_partitions : int The total number of partitions that the data will be divided into. - cid_to_size_fnc: Callable + cid_to_size_fn : Callable Function that defines the relationship between cid and the number of samples. 
""" def __init__( self, num_partitions: int, - cid_to_size_fnc: Callable, # type: ignore[type-arg] + cid_to_size_fn: Callable, # type: ignore[type-arg] ) -> None: super().__init__() if num_partitions <= 0: raise ValueError("The number of partitions must be greater than zero.") self._num_partitions = num_partitions - self._cid_to_size_fnc = cid_to_size_fnc + self._cid_to_size_fn = cid_to_size_fn self._cid_to_size: Dict[int, int] = {} self._cid_to_indices: Dict[int, List[int]] = {} @@ -66,7 +68,7 @@ def load_partition(self, idx: int) -> datasets.Dataset: Parameters ---------- - idx: int + idx : int the index that corresponds to the requested partition Returns @@ -91,7 +93,7 @@ def id_to_indices(self) -> Dict[int, List[int]]: def _determine_id_to_size(self) -> None: """Determine data quantity associated with partition indices.""" - data_division_in_units = self._cid_to_size_fnc( + data_division_in_units = self._cid_to_size_fn( np.linspace(start=1, stop=self._num_partitions, num=self._num_partitions) ) total_units: Union[int, float] = data_division_in_units.sum() diff --git a/datasets/flwr_datasets/partitioner/size_partitioner_test.py b/datasets/flwr_datasets/partitioner/size_partitioner_test.py index 6bcdb59e7cd6..dd7d70253d9a 100644 --- a/datasets/flwr_datasets/partitioner/size_partitioner_test.py +++ b/datasets/flwr_datasets/partitioner/size_partitioner_test.py @@ -13,6 +13,8 @@ # limitations under the License. # ============================================================================== """SizePartitioner tests.""" + + import unittest from parameterized import parameterized diff --git a/datasets/flwr_datasets/partitioner/square_partitioner.py b/datasets/flwr_datasets/partitioner/square_partitioner.py index 8b683be4c589..1619f2826311 100644 --- a/datasets/flwr_datasets/partitioner/square_partitioner.py +++ b/datasets/flwr_datasets/partitioner/square_partitioner.py @@ -13,6 +13,8 @@ # limitations under the License. 
# ============================================================================== """SquarePartitioner class.""" + + import numpy as np from flwr_datasets.partitioner.size_partitioner import SizePartitioner @@ -27,11 +29,11 @@ class SquarePartitioner(SizePartitioner): Parameters ---------- - num_partitions: int + num_partitions : int The total number of partitions that the data will be divided into. """ def __init__(self, num_partitions: int) -> None: - super().__init__(num_partitions=num_partitions, cid_to_size_fnc=np.square) + super().__init__(num_partitions=num_partitions, cid_to_size_fn=np.square) if num_partitions <= 0: raise ValueError("The number of partitions must be greater than zero.") From e048cc6f2230cffe7fbcc1752b6da3a7882bff85 Mon Sep 17 00:00:00 2001 From: Adam Narozniak Date: Tue, 7 Nov 2023 12:42:55 +0100 Subject: [PATCH 15/17] Rename cid to node_id --- .../partitioner/exponential_partitioner.py | 2 +- .../partitioner/linear_partitioner.py | 2 +- .../partitioner/size_partitioner.py | 75 ++++++++++--------- .../partitioner/size_partitioner_test.py | 8 +- .../partitioner/square_partitioner.py | 2 +- 5 files changed, 47 insertions(+), 42 deletions(-) diff --git a/datasets/flwr_datasets/partitioner/exponential_partitioner.py b/datasets/flwr_datasets/partitioner/exponential_partitioner.py index 48ac59dbe42a..07bb07cea9a5 100644 --- a/datasets/flwr_datasets/partitioner/exponential_partitioner.py +++ b/datasets/flwr_datasets/partitioner/exponential_partitioner.py @@ -38,6 +38,6 @@ class ExponentialPartitioner(SizePartitioner): """ def __init__(self, num_partitions: int) -> None: - super().__init__(num_partitions=num_partitions, cid_to_size_fn=np.exp) + super().__init__(num_partitions=num_partitions, node_id_to_size_fn=np.exp) if num_partitions <= 0: raise ValueError("The number of partitions must be greater than zero.") diff --git a/datasets/flwr_datasets/partitioner/linear_partitioner.py b/datasets/flwr_datasets/partitioner/linear_partitioner.py index 
9af8cfb137ca..a9dc8b020c08 100644 --- a/datasets/flwr_datasets/partitioner/linear_partitioner.py +++ b/datasets/flwr_datasets/partitioner/linear_partitioner.py @@ -32,6 +32,6 @@ class LinearPartitioner(SizePartitioner): """ def __init__(self, num_partitions: int) -> None: - super().__init__(num_partitions=num_partitions, cid_to_size_fn=lambda x: x) + super().__init__(num_partitions=num_partitions, node_id_to_size_fn=lambda x: x) if num_partitions <= 0: raise ValueError("The number of partitions must be greater than zero.") diff --git a/datasets/flwr_datasets/partitioner/size_partitioner.py b/datasets/flwr_datasets/partitioner/size_partitioner.py index 46eea2907c9b..dda31b4aa016 100644 --- a/datasets/flwr_datasets/partitioner/size_partitioner.py +++ b/datasets/flwr_datasets/partitioner/size_partitioner.py @@ -24,42 +24,45 @@ class SizePartitioner(Partitioner): - """Base class for the deterministic size partitioning schemes based on the cid. + """Base class for the deterministic size partitioning schemes based on the node id. - The client with cid has the following relationship regarding the number of samples. - cid_to_size_fnc(cid) ~ number of samples for cid + The client with node is has the following relationship regarding the number of + samples. - If the function doesn't transform the cid it's a linear correlations of the number - of sample for cid and the value of cid. For instance, if the IDs range from 1 to M, - client with ID 1 gets 1 unit of data, client 2 gets 2 units, and so on, up to client - M which gets M units. + node_id_to_size_fn(node_id) ~ number of samples for node_id - Note that size corresponding to the cid is deterministic, yet in case of different - dataset shuffling the assignment of samples to cid will vary. + If the function doesn't transform the node_id it's a linear correlations between + the number of sample for the node and the value of node_id. 
For instance, if the + node ids range from 1 to M, node with id 1 gets 1 unit of data, client 2 gets 2 + units, and so on, up to node M which gets M units. + + Note that size corresponding to the node_id is deterministic, yet in case of + different dataset shuffling the assignment of samples to node_id will vary. Parameters ---------- num_partitions : int The total number of partitions that the data will be divided into. - cid_to_size_fn : Callable - Function that defines the relationship between cid and the number of samples. + node_id_to_size_fn : Callable + Function that defines the relationship between node id and the number of + samples. """ def __init__( self, num_partitions: int, - cid_to_size_fn: Callable, # type: ignore[type-arg] + node_id_to_size_fn: Callable, # type: ignore[type-arg] ) -> None: super().__init__() if num_partitions <= 0: raise ValueError("The number of partitions must be greater than zero.") self._num_partitions = num_partitions - self._cid_to_size_fn = cid_to_size_fn + self._node_id_to_size_fn = node_id_to_size_fn - self._cid_to_size: Dict[int, int] = {} - self._cid_to_indices: Dict[int, List[int]] = {} + self._node_id_to_size: Dict[int, int] = {} + self._node_id_to_indices: Dict[int, List[int]] = {} # A flag to perform only a single compute to determine the indices - self._cid_to_indices_determined = False + self._node_id_to_indices_determined = False def load_partition(self, idx: int) -> datasets.Dataset: """Load a single partition based on the partition index. @@ -78,22 +81,22 @@ def load_partition(self, idx: int) -> datasets.Dataset: """ # The partitioning is done lazily - only when the first partition is requested. # A single run creates the indices assignments for all the partition indices. 
- self._determine_id_to_indices_if_needed() - return self.dataset.select(self._cid_to_indices[idx]) + self._determine_node_id_to_indices_if_needed() + return self.dataset.select(self._node_id_to_indices[idx]) @property - def id_to_size(self) -> Dict[int, int]: - """Cid to the number of samples.""" - return self._cid_to_size + def node_id_to_size(self) -> Dict[int, int]: + """Node id to the number of samples.""" + return self._node_id_to_size @property - def id_to_indices(self) -> Dict[int, List[int]]: - """Cid to the list of indices.""" - return self._cid_to_indices + def node_id_to_indices(self) -> Dict[int, List[int]]: + """Node id to the list of indices.""" + return self._node_id_to_indices - def _determine_id_to_size(self) -> None: + def _determine_node_id_to_size(self) -> None: """Determine data quantity associated with partition indices.""" - data_division_in_units = self._cid_to_size_fn( + data_division_in_units = self._node_id_to_size_fn( np.linspace(start=1, stop=self._num_partitions, num=self._num_partitions) ) total_units: Union[int, float] = data_division_in_units.sum() @@ -109,25 +112,25 @@ def _determine_id_to_size(self) -> None: # If there is any sample(s) left unassigned, assign it to the largest partition. 
partition_sizes_as_num_of_samples[-1] += left_unassigned_samples for idx, partition_size in enumerate(partition_sizes_as_num_of_samples): - self._cid_to_size[idx] = partition_size + self._node_id_to_size[idx] = partition_size - self._check_if_id_to_size_possible() + self._check_if_node_id_to_size_possible() - def _determine_id_to_indices_if_needed(self) -> None: + def _determine_node_id_to_indices_if_needed(self) -> None: """Create an assignment of indices to the partition indices..""" - if self._cid_to_indices_determined is True: + if self._node_id_to_indices_determined is True: return - self._determine_id_to_size() + self._determine_node_id_to_size() total_samples_assigned = 0 - for idx, quantity in self._cid_to_size.items(): - self._cid_to_indices[idx] = list( + for idx, quantity in self._node_id_to_size.items(): + self._node_id_to_indices[idx] = list( range(total_samples_assigned, total_samples_assigned + quantity) ) total_samples_assigned += quantity - self._cid_to_indices_determined = True + self._node_id_to_indices_determined = True - def _check_if_id_to_size_possible(self) -> None: - all_positive = all(value >= 1 for value in self.id_to_size.values()) + def _check_if_node_id_to_size_possible(self) -> None: + all_positive = all(value >= 1 for value in self.node_id_to_size.values()) if not all_positive: raise ValueError( f"The given specification of the parameter num_partitions" diff --git a/datasets/flwr_datasets/partitioner/size_partitioner_test.py b/datasets/flwr_datasets/partitioner/size_partitioner_test.py index dd7d70253d9a..390f6a613fce 100644 --- a/datasets/flwr_datasets/partitioner/size_partitioner_test.py +++ b/datasets/flwr_datasets/partitioner/size_partitioner_test.py @@ -49,13 +49,13 @@ def test_linear_distribution(self, num_partitions: int, num_rows: int) -> None: partitioner.dataset = dataset # Run a single partition loading to trigger the division _ = partitioner.load_partition(0) - total_samples = sum(partitioner.id_to_size.values()) + 
total_samples = sum(partitioner.node_id_to_size.values()) self.assertEqual(total_samples, num_rows) # Testing if each partition is getting more than the previous one last_count = 0 for i in range(num_partitions): - current_count = partitioner.id_to_size[i] + current_count = partitioner.node_id_to_size[i] self.assertGreaterEqual(current_count, last_count) last_count = current_count @@ -77,7 +77,9 @@ def test_undivided_samples(self, num_partitions: int, num_rows: int) -> None: actual_samples_in_last_partition = len( partitioner.load_partition(last_partition_id) ) - expected_samples_in_last_partition = partitioner.id_to_size[last_partition_id] + expected_samples_in_last_partition = partitioner.node_id_to_size[ + last_partition_id + ] self.assertEqual( expected_samples_in_last_partition, actual_samples_in_last_partition ) diff --git a/datasets/flwr_datasets/partitioner/square_partitioner.py b/datasets/flwr_datasets/partitioner/square_partitioner.py index 1619f2826311..25affaf304ef 100644 --- a/datasets/flwr_datasets/partitioner/square_partitioner.py +++ b/datasets/flwr_datasets/partitioner/square_partitioner.py @@ -34,6 +34,6 @@ class SquarePartitioner(SizePartitioner): """ def __init__(self, num_partitions: int) -> None: - super().__init__(num_partitions=num_partitions, cid_to_size_fn=np.square) + super().__init__(num_partitions=num_partitions, node_id_to_size_fn=np.square) if num_partitions <= 0: raise ValueError("The number of partitions must be greater than zero.") From 620924f0f85614a121480c5bb4194ec8a689782f Mon Sep 17 00:00:00 2001 From: Adam Narozniak <51029327+adam-narozniak@users.noreply.github.com> Date: Tue, 7 Nov 2023 13:09:42 +0100 Subject: [PATCH 16/17] Apply suggestions from code review Co-authored-by: Daniel J. 
Beutel --- .../flwr_datasets/partitioner/size_partitioner.py | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/datasets/flwr_datasets/partitioner/size_partitioner.py b/datasets/flwr_datasets/partitioner/size_partitioner.py index dda31b4aa016..7fbe4fd2f70d 100644 --- a/datasets/flwr_datasets/partitioner/size_partitioner.py +++ b/datasets/flwr_datasets/partitioner/size_partitioner.py @@ -24,20 +24,20 @@ class SizePartitioner(Partitioner): - """Base class for the deterministic size partitioning schemes based on the node id. + """Base class for the deterministic size partitioning schemes based on the `node_id`. - The client with node is has the following relationship regarding the number of + The client with `node_id` has the following relationship regarding the number of samples. - node_id_to_size_fn(node_id) ~ number of samples for node_id + `node_id_to_size_fn(node_id)` ~ number of samples for `node_id` - If the function doesn't transform the node_id it's a linear correlations between - the number of sample for the node and the value of node_id. For instance, if the + If the function doesn't transform the `node_id` it's a linear correlation between + the number of samples for the node and the value of `node_id`. For instance, if the node ids range from 1 to M, node with id 1 gets 1 unit of data, client 2 gets 2 units, and so on, up to node M which gets M units. - Note that size corresponding to the node_id is deterministic, yet in case of - different dataset shuffling the assignment of samples to node_id will vary. + Note that size corresponding to the `node_id` is deterministic, yet in case of + different dataset shuffling the assignment of samples to `node_id` will vary. 
Parameters ---------- From def2d437394be7dead26adf82076832e493cc35e Mon Sep 17 00:00:00 2001 From: Adam Narozniak Date: Tue, 7 Nov 2023 13:12:00 +0100 Subject: [PATCH 17/17] Change SizePartitioner doc to make line width requirement --- datasets/flwr_datasets/partitioner/size_partitioner.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/datasets/flwr_datasets/partitioner/size_partitioner.py b/datasets/flwr_datasets/partitioner/size_partitioner.py index 7fbe4fd2f70d..27e1f528dc52 100644 --- a/datasets/flwr_datasets/partitioner/size_partitioner.py +++ b/datasets/flwr_datasets/partitioner/size_partitioner.py @@ -24,7 +24,7 @@ class SizePartitioner(Partitioner): - """Base class for the deterministic size partitioning schemes based on the `node_id`. + """Base class for the deterministic size partitioning based on the `node_id`. The client with `node_id` has the following relationship regarding the number of samples.