From 131b55e77e9e51de13d41cd85c712b9d52fdca24 Mon Sep 17 00:00:00 2001 From: Adam Narozniak Date: Thu, 21 Sep 2023 15:00:43 +0200 Subject: [PATCH 01/10] Add cid partitioner --- .../flwr_datasets/partitioner/__init__.py | 2 + .../partitioner/cid_partitioner.py | 78 +++++++++ .../partitioner/cid_partitioner_test.py | 159 ++++++++++++++++++ 3 files changed, 239 insertions(+) create mode 100644 datasets/flwr_datasets/partitioner/cid_partitioner.py create mode 100644 datasets/flwr_datasets/partitioner/cid_partitioner_test.py diff --git a/datasets/flwr_datasets/partitioner/__init__.py b/datasets/flwr_datasets/partitioner/__init__.py index d1eba7895702..6be075c40a08 100644 --- a/datasets/flwr_datasets/partitioner/__init__.py +++ b/datasets/flwr_datasets/partitioner/__init__.py @@ -17,8 +17,10 @@ from .iid_partitioner import IidPartitioner from .partitioner import Partitioner +from .cid_partitioner import CidPartitioner __all__ = [ "IidPartitioner", "Partitioner", + "CidPartitioner" ] diff --git a/datasets/flwr_datasets/partitioner/cid_partitioner.py b/datasets/flwr_datasets/partitioner/cid_partitioner.py new file mode 100644 index 000000000000..3cb6b176cf04 --- /dev/null +++ b/datasets/flwr_datasets/partitioner/cid_partitioner.py @@ -0,0 +1,78 @@ +# Copyright 2023 Flower Labs GmbH. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== +"""Cid partitioner class that works with Hugging Face Datasets.""" +from typing import Dict, List + +import datasets +from flwr_datasets.partitioner.partitioner import Partitioner + + +class CidPartitioner(Partitioner): + """Partitioner for dataset that can be divided by reference to clients.""" + + def __init__( + self, + partition_by: str, + ): + super().__init__() + self._index_to_cid: Dict[int, str] = {} + self._partition_by = partition_by + self._idx_to_rows: Dict[int, List[int]] = {} + + def _create_int_idx_to_cid(self): + """Create a mapping from int indices to unique client ids. + + Client ids come from the columns specified in `partition_by`. + """ + unique_cid = self._dataset.unique(self._partition_by) + self._index_to_cid = dict(zip(range(len(unique_cid)), unique_cid)) + + def _save_partition_indexing(self, idx: int, rows: List[int]): + """Store the rows corresponding to the partition of idx. + + It should be used only after the `load_partition` is used. + """ + self._idx_to_rows[idx] = rows + + def load_partition(self, idx: int) -> datasets.Dataset: + """Load a single partition corresponding to a single CID. + + The choice of the partition is based on unique integers assigned to each cid. + + Parameters + ---------- + idx: int + the index that corresponds to the requested partition + + Returns + ------- + dataset_partition: Dataset + single dataset partition + """ + if len(self._index_to_cid) == 0: + self._create_int_idx_to_cid() + + return self._dataset.filter( + lambda row: row[self._partition_by] == self._index_to_cid[idx] + ) + + @property + def index_to_cid(self) -> Dict[int, str]: + """Index to corresponding cid from the dataset property.""" + return self._index_to_cid + + @index_to_cid.setter + def index_to_cid(self, value: Dict[int, str]) -> None: + raise AttributeError("Setting the index_to_cid dictionary is not allowed.") diff --git a/datasets/flwr_datasets/partitioner/cid_partitioner_test.py b/datasets/flwr_datasets/partitioner/cid_partitioner_test.py new file mode 100644 index 000000000000..84fac2c39024 --- /dev/null +++ b/datasets/flwr_datasets/partitioner/cid_partitioner_test.py @@ -0,0 +1,159 @@ +# Copyright 2023 Flower Labs GmbH. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable alaw or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== +"""CID partitioner tests.""" +import itertools +import math +import unittest +from typing import Tuple + +from parameterized import parameterized + +from datasets import Dataset +from flwr_datasets.partitioner.cid_partitioner import CidPartitioner + + +def _dummy_setup(num_rows: int, n_unique_cids: int) -> Tuple[Dataset, CidPartitioner]: + """Create a dummy dataset and partitioner based on given arguments. + + The partitioner has automatically the dataset assigned to it. + """ + dataset = _create_dataset(num_rows, n_unique_cids) + partitioner = CidPartitioner(partition_by="cid") + partitioner.dataset = dataset + return dataset, partitioner + + +def _create_dataset(num_rows: int, n_unique_cids: int): + """Create dataset based on the number of rows and unique cids.""" + data = { + "features": list(range(num_rows)), + "cid": [f"{i % n_unique_cids}" for i in range(num_rows)], + "labels": [i % 2 for i in range(num_rows)], + } + dataset = Dataset.from_dict(data) + return dataset + + +class TestCidPartitioner(unittest.TestCase): + """Test IidPartitioner.""" + + @parameterized.expand( # type: ignore + # num_rows, num_unique_cids + list(itertools.product([10, 30, 100, 1000], [2, 3, 4, 5])) + ) + def test_load_partition_num_partitions(self, num_rows: int, + num_unique_cid: int) -> None: + """Test if the number of partitions match the number of unique cids. + + Only the correct data is tested in this method. + """ + _, partitioner = _dummy_setup(num_rows, num_unique_cid) + # Simulate usage to start lazy index_to_cid creation + _ = partitioner.load_partition(0) + self.assertEqual(len(partitioner.index_to_cid), num_unique_cid) + + @parameterized.expand( # type: ignore + # num_rows, num_unique_cids + list(itertools.product([10, 30, 100, 1000], [2, 3, 4, 5])) + ) + def test_load_partition_max_partition_size(self, num_rows: int, + num_unique_cid: int) -> None: + """Test if the number of partitions match the number of unique cids. + + Only the correct data is tested in this method. + """ + print(num_rows) + print(num_unique_cid) + _, partitioner = _dummy_setup(num_rows, num_unique_cid) + max_size = max([len(partitioner.load_partition(i)) for i in range(num_unique_cid)]) + self.assertEqual(max_size, math.ceil(num_rows / num_unique_cid)) + + def test_partitioner_with_non_existing_column_partition_by( + self + ) -> None: + """Test error when the partition_by columns does not exist.""" + dataset = _create_dataset(10, 2) + partitioner = CidPartitioner(partition_by="not-cid") + partitioner.dataset = dataset + with self.assertRaises(ValueError): + partitioner.load_partition(0) + + # @parameterized.expand( # type: ignore + # [ + # # num_partitions, num_rows, partition_index + # (10, 10, 10), + # (10, 10, -1), + # (10, 10, 11), + # (10, 100, 1000), + # (5, 50, 60), + # (20, 200, 210), + # ] + # ) + # def test_load_invalid_partition_index( + # self, num_partitions: int, num_rows: int, partition_index: int + # ) -> None: + # """Test loading a partition with an index out of range.""" + # _, partitioner = _dummy_setup(num_partitions, num_rows) + # with self.assertRaises(ValueError): + # partitioner.load_partition(partition_index) + # + # def test_is_dataset_assigned_false(self) -> None: + # """Test if the is_dataset_assigned method works correctly if not assigned.""" + # partitioner = IidPartitioner(num_partitions=10) + # + # # Initially, the dataset should not be assigned + # self.assertFalse(partitioner.is_dataset_assigned()) + # + # def test_is_dataset_assigned_true(self) -> None: + # """Test if the is_dataset_assigned method works correctly if assigned.""" + # num_partitions = 10 + # num_rows = 100 + # _, partitioner = _dummy_setup(num_partitions, num_rows) + # self.assertTrue(partitioner.is_dataset_assigned()) + # + # def test_dataset_setter(self) -> None: + # """Test the dataset.setter method.""" + # num_partitions = 10 + # num_rows = 100 + # dataset, partitioner = _dummy_setup(num_partitions, num_rows) + # + # # It should not allow setting the dataset a second time + # with self.assertRaises(Exception) as context: + # partitioner.dataset = dataset + # self.assertIn( + # "The dataset should be assigned only once", str(context.exception) + # ) + # + # def test_dataset_getter_raises(self) -> None: + # """Test the dataset getter method.""" + # num_partitions = 10 + # partitioner = IidPartitioner(num_partitions=num_partitions) + # with self.assertRaises(AttributeError) as context: + # _ = partitioner.dataset + # self.assertIn( + # "The dataset field should be set before using it", str(context.exception) + # ) + # + # def test_dataset_getter_used_correctly(self) -> None: + # """Test the dataset getter method.""" + # num_partitions = 10 + # num_rows = 100 + # dataset, partitioner = _dummy_setup(num_partitions, num_rows) + # # After setting, it should return the dataset + # self.assertEqual(partitioner.dataset, dataset) + + +if __name__ == "__main__": + unittest.main() From df3363575994816fe9c9a64528eb1b274b033ef8 Mon Sep 17 00:00:00 2001 From: Adam Narozniak Date: Mon, 25 Sep 2023 14:17:43 +0200 Subject: [PATCH 02/10] Fix formatting --- datasets/flwr_datasets/partitioner/__init__.py | 8 ++------ .../partitioner/cid_partitioner_test.py | 18 ++++++++++-------- 2 files changed, 12 insertions(+), 14 deletions(-) diff --git a/datasets/flwr_datasets/partitioner/__init__.py b/datasets/flwr_datasets/partitioner/__init__.py index 6be075c40a08..acc22a480348 100644 --- a/datasets/flwr_datasets/partitioner/__init__.py +++ b/datasets/flwr_datasets/partitioner/__init__.py @@ -15,12 +15,8 @@ """Flower Datasets Partitioner package.""" +from .cid_partitioner import CidPartitioner from .iid_partitioner import IidPartitioner from .partitioner import Partitioner -from .cid_partitioner import CidPartitioner -__all__ = [ - "IidPartitioner", - "Partitioner", - "CidPartitioner" -] +__all__ = ["IidPartitioner", "Partitioner", "CidPartitioner"] diff --git a/datasets/flwr_datasets/partitioner/cid_partitioner_test.py b/datasets/flwr_datasets/partitioner/cid_partitioner_test.py index 84fac2c39024..8fa43886be93 100644 --- a/datasets/flwr_datasets/partitioner/cid_partitioner_test.py +++ b/datasets/flwr_datasets/partitioner/cid_partitioner_test.py @@ -53,8 +53,9 @@ class TestCidPartitioner(unittest.TestCase): # num_rows, num_unique_cids list(itertools.product([10, 30, 100, 1000], [2, 3, 4, 5])) ) - def test_load_partition_num_partitions(self, num_rows: int, - num_unique_cid: int) -> None: + def test_load_partition_num_partitions( + self, num_rows: int, num_unique_cid: int + ) -> None: """Test if the number of partitions match the number of unique cids. Only the correct data is tested in this method. @@ -68,8 +69,9 @@ def test_load_partition_num_partitions(self, num_rows: int, # num_rows, num_unique_cids list(itertools.product([10, 30, 100, 1000], [2, 3, 4, 5])) ) - def test_load_partition_max_partition_size(self, num_rows: int, - num_unique_cid: int) -> None: + def test_load_partition_max_partition_size( + self, num_rows: int, num_unique_cid: int + ) -> None: """Test if the number of partitions match the number of unique cids. Only the correct data is tested in this method. @@ -77,12 +79,12 @@ def test_load_partition_max_partition_size(self, num_rows: int, print(num_rows) print(num_unique_cid) _, partitioner = _dummy_setup(num_rows, num_unique_cid) - max_size = max([len(partitioner.load_partition(i)) for i in range(num_unique_cid)]) + max_size = max( + [len(partitioner.load_partition(i)) for i in range(num_unique_cid)] + ) self.assertEqual(max_size, math.ceil(num_rows / num_unique_cid)) - def test_partitioner_with_non_existing_column_partition_by( - self - ) -> None: + def test_partitioner_with_non_existing_column_partition_by(self) -> None: """Test error when the partition_by columns does not exist.""" dataset = _create_dataset(10, 2) partitioner = CidPartitioner(partition_by="not-cid") From bea36aa057e5bb889ba2a3c13c785f391b15ee06 Mon Sep 17 00:00:00 2001 From: Adam Narozniak Date: Mon, 25 Sep 2023 14:34:26 +0200 Subject: [PATCH 03/10] Finish up CidPartitioner tests --- .../partitioner/cid_partitioner_test.py | 79 ++++--------------- 1 file changed, 16 insertions(+), 63 deletions(-) diff --git a/datasets/flwr_datasets/partitioner/cid_partitioner_test.py b/datasets/flwr_datasets/partitioner/cid_partitioner_test.py index 8fa43886be93..2c370143e9a2 100644 --- a/datasets/flwr_datasets/partitioner/cid_partitioner_test.py +++ b/datasets/flwr_datasets/partitioner/cid_partitioner_test.py @@ -92,69 +92,22 @@ def test_partitioner_with_non_existing_column_partition_by(self) -> None: with self.assertRaises(ValueError): partitioner.load_partition(0) - # @parameterized.expand( # type: ignore - # [ - # # num_partitions, num_rows, partition_index - # (10, 10, 10), - # (10, 10, -1), - # (10, 10, 11), - # (10, 100, 1000), - # (5, 50, 60), - # (20, 200, 210), - # ] - # ) - # def test_load_invalid_partition_index( - # self, num_partitions: int, num_rows: int, partition_index: int - # ) -> None: - # """Test loading a partition with an index out of range.""" - # _, partitioner = _dummy_setup(num_partitions, num_rows) - # with self.assertRaises(ValueError): - # partitioner.load_partition(partition_index) - # - # def test_is_dataset_assigned_false(self) -> None: - # """Test if the is_dataset_assigned method works correctly if not assigned.""" - # partitioner = IidPartitioner(num_partitions=10) - # - # # Initially, the dataset should not be assigned - # self.assertFalse(partitioner.is_dataset_assigned()) - # - # def test_is_dataset_assigned_true(self) -> None: - # """Test if the is_dataset_assigned method works correctly if assigned.""" - # num_partitions = 10 - # num_rows = 100 - # _, partitioner = _dummy_setup(num_partitions, num_rows) - # self.assertTrue(partitioner.is_dataset_assigned()) - # - # def test_dataset_setter(self) -> None: - # """Test the dataset.setter method.""" - # num_partitions = 10 - # num_rows = 100 - # dataset, partitioner = _dummy_setup(num_partitions, num_rows) - # - # # It should not allow setting the dataset a second time - # with self.assertRaises(Exception) as context: - # partitioner.dataset = dataset - # self.assertIn( - # "The dataset should be assigned only once", str(context.exception) - # ) - # - # def test_dataset_getter_raises(self) -> None: - # """Test the dataset getter method.""" - # num_partitions = 10 - # partitioner = IidPartitioner(num_partitions=num_partitions) - # with self.assertRaises(AttributeError) as context: - # _ = partitioner.dataset - # self.assertIn( - # "The dataset field should be set before using it", str(context.exception) - # ) - # - # def test_dataset_getter_used_correctly(self) -> None: - # """Test the dataset getter method.""" - # num_partitions = 10 - # num_rows = 100 - # dataset, partitioner = _dummy_setup(num_partitions, num_rows) - # # After setting, it should return the dataset - # self.assertEqual(partitioner.dataset, dataset) + @parameterized.expand( # type: ignore + # num_rows, num_unique_cids + list(itertools.product([10, 30, 100, 1000], [2, 3, 4, 5])) + ) + def test_correct_number_of_partitions( + self, num_rows: int, num_unique_cid: int + ) -> None: + """Test if the # of available partitions is equal to # of unique clients.""" + dataset, partitioner = _dummy_setup(num_rows, num_unique_cid) + _ = partitioner.load_partition(idx=0) + self.assertEqual(len(partitioner.index_to_cid), num_unique_cid) + + def test_cannot_set_index_to_cid(self): + dataset, partitioner = _dummy_setup(num_rows=10, n_unique_cids=2) + with self.assertRaises(AttributeError): + partitioner.index_to_cid = {0: "0"} if __name__ == "__main__": From cf8b67ff66f52e1e8eb989b548369608e1aa28b8 Mon Sep 17 00:00:00 2001 From: Adam Narozniak Date: Mon, 25 Sep 2023 14:35:44 +0200 Subject: [PATCH 04/10] Add a missing docstring --- datasets/flwr_datasets/partitioner/cid_partitioner_test.py | 1 + 1 file changed, 1 insertion(+) diff --git a/datasets/flwr_datasets/partitioner/cid_partitioner_test.py b/datasets/flwr_datasets/partitioner/cid_partitioner_test.py index 2c370143e9a2..7363bdf555be 100644 --- a/datasets/flwr_datasets/partitioner/cid_partitioner_test.py +++ b/datasets/flwr_datasets/partitioner/cid_partitioner_test.py @@ -105,6 +105,7 @@ def test_correct_number_of_partitions( self.assertEqual(len(partitioner.index_to_cid), num_unique_cid) def test_cannot_set_index_to_cid(self): + """Test the lack of ability to set index_to_cid.""" dataset, partitioner = _dummy_setup(num_rows=10, n_unique_cids=2) with self.assertRaises(AttributeError): partitioner.index_to_cid = {0: "0"} From 809269307493b5b93d57d9e347347d9381f99475 Mon Sep 17 00:00:00 2001 From: Adam Narozniak Date: Mon, 25 Sep 2023 14:45:03 +0200 Subject: [PATCH 05/10] Fix tests --- datasets/flwr_datasets/partitioner/cid_partitioner.py | 6 +++--- datasets/flwr_datasets/partitioner/cid_partitioner_test.py | 4 ++-- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/datasets/flwr_datasets/partitioner/cid_partitioner.py b/datasets/flwr_datasets/partitioner/cid_partitioner.py index 3cb6b176cf04..d913ad46b9bd 100644 --- a/datasets/flwr_datasets/partitioner/cid_partitioner.py +++ b/datasets/flwr_datasets/partitioner/cid_partitioner.py @@ -20,7 +20,7 @@ class CidPartitioner(Partitioner): - """Partitioner for dataset that can be divided by reference to clients.""" + """Partitioner for dataset that can be divided by a reference to clients.""" def __init__( self, @@ -31,7 +31,7 @@ def __init__( self._partition_by = partition_by self._idx_to_rows: Dict[int, List[int]] = {} - def _create_int_idx_to_cid(self): + def _create_int_idx_to_cid(self) -> None: """Create a mapping from int indices to unique client ids. Client ids come from the columns specified in `partition_by`. @@ -39,7 +39,7 @@ def _create_int_idx_to_cid(self): unique_cid = self._dataset.unique(self._partition_by) self._index_to_cid = dict(zip(range(len(unique_cid)), unique_cid)) - def _save_partition_indexing(self, idx: int, rows: List[int]): + def _save_partition_indexing(self, idx: int, rows: List[int]) -> None: """Store the rows corresponding to the partition of idx. It should be used only after the `load_partition` is used. diff --git a/datasets/flwr_datasets/partitioner/cid_partitioner_test.py b/datasets/flwr_datasets/partitioner/cid_partitioner_test.py index 7363bdf555be..83db1c72b1f5 100644 --- a/datasets/flwr_datasets/partitioner/cid_partitioner_test.py +++ b/datasets/flwr_datasets/partitioner/cid_partitioner_test.py @@ -35,7 +35,7 @@ def _dummy_setup(num_rows: int, n_unique_cids: int) -> Tuple[Dataset, CidPartiti return dataset, partitioner -def _create_dataset(num_rows: int, n_unique_cids: int): +def _create_dataset(num_rows: int, n_unique_cids: int) -> Dataset: """Create dataset based on the number of rows and unique cids.""" data = { "features": list(range(num_rows)), @@ -104,7 +104,7 @@ def test_correct_number_of_partitions( _ = partitioner.load_partition(idx=0) self.assertEqual(len(partitioner.index_to_cid), num_unique_cid) - def test_cannot_set_index_to_cid(self): + def test_cannot_set_index_to_cid(self) -> None: """Test the lack of ability to set index_to_cid.""" dataset, partitioner = _dummy_setup(num_rows=10, n_unique_cids=2) with self.assertRaises(AttributeError): From 821e26d578e9f71e9655488270a60274ec1cfb54 Mon Sep 17 00:00:00 2001 From: Adam Narozniak Date: Mon, 25 Sep 2023 14:49:30 +0200 Subject: [PATCH 06/10] Fix tests --- datasets/flwr_datasets/partitioner/cid_partitioner.py | 5 +++-- datasets/flwr_datasets/partitioner/cid_partitioner_test.py | 4 ++-- 2 files changed, 5 insertions(+), 4 deletions(-) diff --git a/datasets/flwr_datasets/partitioner/cid_partitioner.py b/datasets/flwr_datasets/partitioner/cid_partitioner.py index d913ad46b9bd..98ec9b8495c5 100644 --- a/datasets/flwr_datasets/partitioner/cid_partitioner.py +++ b/datasets/flwr_datasets/partitioner/cid_partitioner.py @@ -36,7 +36,7 @@ def _create_int_idx_to_cid(self) -> None: Client ids come from the columns specified in `partition_by`. """ - unique_cid = self._dataset.unique(self._partition_by) + unique_cid = self.dataset.unique(self._partition_by) self._index_to_cid = dict(zip(range(len(unique_cid)), unique_cid)) def _save_partition_indexing(self, idx: int, rows: List[int]) -> None: @@ -64,7 +64,7 @@ def load_partition(self, idx: int) -> datasets.Dataset: if len(self._index_to_cid) == 0: self._create_int_idx_to_cid() - return self._dataset.filter( + return self.dataset.filter( lambda row: row[self._partition_by] == self._index_to_cid[idx] ) @@ -73,6 +73,7 @@ def index_to_cid(self) -> Dict[int, str]: """Index to corresponding cid from the dataset property.""" return self._index_to_cid + # pylint: disable=R0201 @index_to_cid.setter def index_to_cid(self, value: Dict[int, str]) -> None: raise AttributeError("Setting the index_to_cid dictionary is not allowed.") diff --git a/datasets/flwr_datasets/partitioner/cid_partitioner_test.py b/datasets/flwr_datasets/partitioner/cid_partitioner_test.py index 83db1c72b1f5..388d6d10b82f 100644 --- a/datasets/flwr_datasets/partitioner/cid_partitioner_test.py +++ b/datasets/flwr_datasets/partitioner/cid_partitioner_test.py @@ -100,13 +100,13 @@ def test_correct_number_of_partitions( self, num_rows: int, num_unique_cid: int ) -> None: """Test if the # of available partitions is equal to # of unique clients.""" - dataset, partitioner = _dummy_setup(num_rows, num_unique_cid) + _, partitioner = _dummy_setup(num_rows, num_unique_cid) _ = partitioner.load_partition(idx=0) self.assertEqual(len(partitioner.index_to_cid), num_unique_cid) def test_cannot_set_index_to_cid(self) -> None: """Test the lack of ability to set index_to_cid.""" - dataset, partitioner = _dummy_setup(num_rows=10, n_unique_cids=2) + _, partitioner = _dummy_setup(num_rows=10, n_unique_cids=2) with self.assertRaises(AttributeError): partitioner.index_to_cid = {0: "0"} From 7e9288cf2b65f765d137182050d4d2572fd1adfb Mon Sep 17 00:00:00 2001 From: Adam Narozniak Date: Mon, 25 Sep 2023 15:12:04 +0200 Subject: [PATCH 07/10] Remove unused parts --- datasets/flwr_datasets/partitioner/cid_partitioner.py | 10 +--------- 1 file changed, 1 insertion(+), 9 deletions(-) diff --git a/datasets/flwr_datasets/partitioner/cid_partitioner.py b/datasets/flwr_datasets/partitioner/cid_partitioner.py index 98ec9b8495c5..d39931845dc3 100644 --- a/datasets/flwr_datasets/partitioner/cid_partitioner.py +++ b/datasets/flwr_datasets/partitioner/cid_partitioner.py @@ -13,7 +13,7 @@ # limitations under the License. # ============================================================================== """Cid partitioner class that works with Hugging Face Datasets.""" -from typing import Dict, List +from typing import Dict import datasets from flwr_datasets.partitioner.partitioner import Partitioner @@ -29,7 +29,6 @@ def __init__( super().__init__() self._index_to_cid: Dict[int, str] = {} self._partition_by = partition_by - self._idx_to_rows: Dict[int, List[int]] = {} def _create_int_idx_to_cid(self) -> None: """Create a mapping from int indices to unique client ids. @@ -39,13 +38,6 @@ def _create_int_idx_to_cid(self) -> None: unique_cid = self.dataset.unique(self._partition_by) self._index_to_cid = dict(zip(range(len(unique_cid)), unique_cid)) - def _save_partition_indexing(self, idx: int, rows: List[int]) -> None: - """Store the rows corresponding to the partition of idx. - - It should be used only after the `load_partition` is used. - """ - self._idx_to_rows[idx] = rows - def load_partition(self, idx: int) -> datasets.Dataset: """Load a single partition corresponding to a single CID. From 57416f886ac6334477b82f790bcb2335bab5af36 Mon Sep 17 00:00:00 2001 From: Adam Narozniak Date: Tue, 7 Nov 2023 12:22:01 +0100 Subject: [PATCH 08/10] Rename cid to natural_id --- .../flwr_datasets/partitioner/__init__.py | 4 +- ...rtitioner.py => natural_id_partitioner.py} | 38 ++++++++++--------- ...test.py => natural_id_partitioner_test.py} | 20 +++++----- 3 files changed, 34 insertions(+), 28 deletions(-) rename datasets/flwr_datasets/partitioner/{cid_partitioner.py => natural_id_partitioner.py} (62%) rename datasets/flwr_datasets/partitioner/{cid_partitioner_test.py => natural_id_partitioner_test.py} (85%) diff --git a/datasets/flwr_datasets/partitioner/__init__.py b/datasets/flwr_datasets/partitioner/__init__.py index acc22a480348..70cdfb4ae434 100644 --- a/datasets/flwr_datasets/partitioner/__init__.py +++ b/datasets/flwr_datasets/partitioner/__init__.py @@ -15,8 +15,8 @@ """Flower Datasets Partitioner package.""" -from .cid_partitioner import CidPartitioner from .iid_partitioner import IidPartitioner +from .natural_id_partitioner import NaturalIdPartitioner from .partitioner import Partitioner -__all__ = ["IidPartitioner", "Partitioner", "CidPartitioner"] +__all__ = ["IidPartitioner", "Partitioner", "NaturalIdPartitioner"] diff --git a/datasets/flwr_datasets/partitioner/cid_partitioner.py b/datasets/flwr_datasets/partitioner/natural_id_partitioner.py similarity index 62% rename from datasets/flwr_datasets/partitioner/cid_partitioner.py rename to datasets/flwr_datasets/partitioner/natural_id_partitioner.py index d39931845dc3..7493d2ac911e 100644 --- a/datasets/flwr_datasets/partitioner/cid_partitioner.py +++ b/datasets/flwr_datasets/partitioner/natural_id_partitioner.py @@ -12,31 +12,33 @@ # See the License for the specific language governing permissions and # limitations under the License. # ============================================================================== -"""Cid partitioner class that works with Hugging Face Datasets.""" +"""Natural id partitioner class that works with Hugging Face Datasets.""" from typing import Dict import datasets from flwr_datasets.partitioner.partitioner import Partitioner -class CidPartitioner(Partitioner): - """Partitioner for dataset that can be divided by a reference to clients.""" +class NaturalIdPartitioner(Partitioner): + """Partitioner for dataset that can be divided by a reference to id in dataset.""" def __init__( self, partition_by: str, ): super().__init__() - self._index_to_cid: Dict[int, str] = {} + self._node_id_to_natural_id: Dict[int, str] = {} self._partition_by = partition_by - def _create_int_idx_to_cid(self) -> None: - """Create a mapping from int indices to unique client ids. + def _create_int_node_id_to_natural_id(self) -> None: + """Create a mapping from int indices to unique client ids from dataset. - Client ids come from the columns specified in `partition_by`. + Natural ids come from the column specified in `partition_by`. """ - unique_cid = self.dataset.unique(self._partition_by) - self._index_to_cid = dict(zip(range(len(unique_cid)), unique_cid)) + unique_natural_ids = self.dataset.unique(self._partition_by) + self._node_id_to_natural_id = dict( + zip(range(len(unique_natural_ids)), unique_natural_ids) + ) def load_partition(self, idx: int) -> datasets.Dataset: """Load a single partition corresponding to a single CID. @@ -53,19 +55,21 @@ def load_partition(self, idx: int) -> datasets.Dataset: dataset_partition: Dataset single dataset partition """ - if len(self._index_to_cid) == 0: - self._create_int_idx_to_cid() + if len(self._node_id_to_natural_id) == 0: + self._create_int_node_id_to_natural_id() return self.dataset.filter( - lambda row: row[self._partition_by] == self._index_to_cid[idx] + lambda row: row[self._partition_by] == self._node_id_to_natural_id[idx] ) @property - def index_to_cid(self) -> Dict[int, str]: + def node_id_to_natural_id(self) -> Dict[int, str]: """Index to corresponding cid from the dataset property.""" - return self._index_to_cid + return self._node_id_to_natural_id # pylint: disable=R0201 - @index_to_cid.setter - def index_to_cid(self, value: Dict[int, str]) -> None: - raise AttributeError("Setting the index_to_cid dictionary is not allowed.") + @node_id_to_natural_id.setter + def node_id_to_natural_id(self, value: Dict[int, str]) -> None: + raise AttributeError( + "Setting the node_id_to_natural_id dictionary is not allowed." + ) diff --git a/datasets/flwr_datasets/partitioner/cid_partitioner_test.py b/datasets/flwr_datasets/partitioner/natural_id_partitioner_test.py similarity index 85% rename from datasets/flwr_datasets/partitioner/cid_partitioner_test.py rename to datasets/flwr_datasets/partitioner/natural_id_partitioner_test.py index 388d6d10b82f..6118423c796d 100644 --- a/datasets/flwr_datasets/partitioner/cid_partitioner_test.py +++ b/datasets/flwr_datasets/partitioner/natural_id_partitioner_test.py @@ -21,16 +21,18 @@ from parameterized import parameterized from datasets import Dataset -from flwr_datasets.partitioner.cid_partitioner import CidPartitioner +from flwr_datasets.partitioner.natural_id_partitioner import NaturalIdPartitioner -def _dummy_setup(num_rows: int, n_unique_cids: int) -> Tuple[Dataset, CidPartitioner]: +def _dummy_setup( + num_rows: int, n_unique_cids: int +) -> Tuple[Dataset, NaturalIdPartitioner]: """Create a dummy dataset and partitioner based on given arguments. The partitioner has automatically the dataset assigned to it. """ dataset = _create_dataset(num_rows, n_unique_cids) - partitioner = CidPartitioner(partition_by="cid") + partitioner = NaturalIdPartitioner(partition_by="cid") partitioner.dataset = dataset return dataset, partitioner @@ -61,9 +63,9 @@ def test_load_partition_num_partitions( Only the correct data is tested in this method. """ _, partitioner = _dummy_setup(num_rows, num_unique_cid) - # Simulate usage to start lazy index_to_cid creation + # Simulate usage to start lazy node_id_to_natural_id creation _ = partitioner.load_partition(0) - self.assertEqual(len(partitioner.index_to_cid), num_unique_cid) + self.assertEqual(len(partitioner.node_id_to_natural_id), num_unique_cid) @parameterized.expand( # type: ignore # num_rows, num_unique_cids @@ -87,7 +89,7 @@ def test_load_partition_max_partition_size( def test_partitioner_with_non_existing_column_partition_by(self) -> None: """Test error when the partition_by columns does not exist.""" dataset = _create_dataset(10, 2) - partitioner = CidPartitioner(partition_by="not-cid") + partitioner = NaturalIdPartitioner(partition_by="not-cid") partitioner.dataset = dataset with self.assertRaises(ValueError): partitioner.load_partition(0) @@ -102,13 +104,13 @@ def test_correct_number_of_partitions( """Test if the # of available partitions is equal to # of unique clients.""" _, partitioner = _dummy_setup(num_rows, num_unique_cid) _ = partitioner.load_partition(idx=0) - self.assertEqual(len(partitioner.index_to_cid), num_unique_cid) + self.assertEqual(len(partitioner.node_id_to_natural_id), num_unique_cid) def test_cannot_set_index_to_cid(self) -> None: - """Test the lack of ability to set index_to_cid.""" + """Test the lack of ability to set node_id_to_natural_id.""" _, partitioner = _dummy_setup(num_rows=10, n_unique_cids=2) with self.assertRaises(AttributeError): - partitioner.index_to_cid = {0: "0"} + partitioner.node_id_to_natural_id = {0: "0"} if __name__ == "__main__": From 8b53ba4df50571ce2a116f5e6c9e16a509912865 Mon Sep 17 00:00:00 2001 From: Adam Narozniak Date: Tue, 7 Nov 2023 13:45:30 +0100 Subject: [PATCH 09/10] Apply code review suggestions, rename cid to natural id --- .../flwr_datasets/partitioner/__init__.py | 6 +- .../partitioner/natural_id_partitioner.py | 12 +++- .../natural_id_partitioner_test.py | 56 ++++++++++--------- 3 files changed, 43 insertions(+), 31 deletions(-) diff --git a/datasets/flwr_datasets/partitioner/__init__.py b/datasets/flwr_datasets/partitioner/__init__.py index 70cdfb4ae434..e5ff0d7c35c9 100644 --- a/datasets/flwr_datasets/partitioner/__init__.py +++ b/datasets/flwr_datasets/partitioner/__init__.py @@ -19,4 +19,8 @@ from .natural_id_partitioner import NaturalIdPartitioner from .partitioner import Partitioner -__all__ = ["IidPartitioner", "Partitioner", "NaturalIdPartitioner"] +__all__ = [ + "IidPartitioner", + "Partitioner", + "NaturalIdPartitioner", +] diff --git a/datasets/flwr_datasets/partitioner/natural_id_partitioner.py b/datasets/flwr_datasets/partitioner/natural_id_partitioner.py index 7493d2ac911e..9591cf911139 100644 --- a/datasets/flwr_datasets/partitioner/natural_id_partitioner.py +++ b/datasets/flwr_datasets/partitioner/natural_id_partitioner.py @@ -13,6 +13,8 @@ # limitations under the License. # ============================================================================== """Natural id partitioner class that works with Hugging Face Datasets.""" + + from typing import Dict import datasets @@ -41,9 +43,10 @@ def _create_int_node_id_to_natural_id(self) -> None: ) def load_partition(self, idx: int) -> datasets.Dataset: - """Load a single partition corresponding to a single CID. + """Load a single partition corresponding to a single `node_id`. - The choice of the partition is based on unique integers assigned to each cid. + The choice of the partition is based on unique integers assigned to each + natural id present in the dataset in the `partition_by` column. Parameters ---------- @@ -64,7 +67,10 @@ def load_partition(self, idx: int) -> datasets.Dataset: @property def node_id_to_natural_id(self) -> Dict[int, str]: - """Index to corresponding cid from the dataset property.""" + """Node id to corresponding natural id present. + + Natural ids are the unique values in `partition_by` column in dataset. + """ return self._node_id_to_natural_id # pylint: disable=R0201 diff --git a/datasets/flwr_datasets/partitioner/natural_id_partitioner_test.py b/datasets/flwr_datasets/partitioner/natural_id_partitioner_test.py index 6118423c796d..4e382b77095c 100644 --- a/datasets/flwr_datasets/partitioner/natural_id_partitioner_test.py +++ b/datasets/flwr_datasets/partitioner/natural_id_partitioner_test.py @@ -12,7 +12,9 @@ # See the License for the specific language governing permissions and # limitations under the License. # ============================================================================== -"""CID partitioner tests.""" +"""NaturalIdPartitioner partitioner tests.""" + + import itertools import math import unittest @@ -25,90 +27,90 @@ def _dummy_setup( - num_rows: int, n_unique_cids: int + num_rows: int, n_unique_natural_ids: int ) -> Tuple[Dataset, NaturalIdPartitioner]: """Create a dummy dataset and partitioner based on given arguments. The partitioner has automatically the dataset assigned to it. """ - dataset = _create_dataset(num_rows, n_unique_cids) - partitioner = NaturalIdPartitioner(partition_by="cid") + dataset = _create_dataset(num_rows, n_unique_natural_ids) + partitioner = NaturalIdPartitioner(partition_by="natural_id") partitioner.dataset = dataset return dataset, partitioner -def _create_dataset(num_rows: int, n_unique_cids: int) -> Dataset: - """Create dataset based on the number of rows and unique cids.""" +def _create_dataset(num_rows: int, n_unique_natural_ids: int) -> Dataset: + """Create dataset based on the number of rows and unique natural ids.""" data = { "features": list(range(num_rows)), - "cid": [f"{i % n_unique_cids}" for i in range(num_rows)], + "natural_id": [f"{i % n_unique_natural_ids}" for i in range(num_rows)], "labels": [i % 2 for i in range(num_rows)], } dataset = Dataset.from_dict(data) return dataset -class TestCidPartitioner(unittest.TestCase): +class TestNaturalIdPartitioner(unittest.TestCase): """Test IidPartitioner.""" @parameterized.expand( # type: ignore - # num_rows, num_unique_cids + # num_rows, num_unique_natural_ids list(itertools.product([10, 30, 100, 1000], [2, 3, 4, 5])) ) def test_load_partition_num_partitions( - self, num_rows: int, num_unique_cid: int + self, num_rows: int, num_unique_natural_id: int ) -> None: - """Test if the number of partitions match the number of unique cids. + """Test if the number of partitions match the number of unique natural ids. Only the correct data is tested in this method. """ - _, partitioner = _dummy_setup(num_rows, num_unique_cid) + _, partitioner = _dummy_setup(num_rows, num_unique_natural_id) # Simulate usage to start lazy node_id_to_natural_id creation _ = partitioner.load_partition(0) - self.assertEqual(len(partitioner.node_id_to_natural_id), num_unique_cid) + self.assertEqual(len(partitioner.node_id_to_natural_id), num_unique_natural_id) @parameterized.expand( # type: ignore - # num_rows, num_unique_cids + # num_rows, num_unique_natural_ids list(itertools.product([10, 30, 100, 1000], [2, 3, 4, 5])) ) def test_load_partition_max_partition_size( - self, num_rows: int, num_unique_cid: int + self, num_rows: int, num_unique_natural_ids: int ) -> None: - """Test if the number of partitions match the number of unique cids. + """Test if the number of partitions match the number of unique natural ids. Only the correct data is tested in this method. """ print(num_rows) - print(num_unique_cid) - _, partitioner = _dummy_setup(num_rows, num_unique_cid) + print(num_unique_natural_ids) + _, partitioner = _dummy_setup(num_rows, num_unique_natural_ids) max_size = max( - [len(partitioner.load_partition(i)) for i in range(num_unique_cid)] + [len(partitioner.load_partition(i)) for i in range(num_unique_natural_ids)] ) - self.assertEqual(max_size, math.ceil(num_rows / num_unique_cid)) + self.assertEqual(max_size, math.ceil(num_rows / num_unique_natural_ids)) def test_partitioner_with_non_existing_column_partition_by(self) -> None: """Test error when the partition_by columns does not exist.""" dataset = _create_dataset(10, 2) - partitioner = NaturalIdPartitioner(partition_by="not-cid") + partitioner = NaturalIdPartitioner(partition_by="not-existing") partitioner.dataset = dataset with self.assertRaises(ValueError): partitioner.load_partition(0) @parameterized.expand( # type: ignore - # num_rows, num_unique_cids + # num_rows, num_unique_natural_ids list(itertools.product([10, 30, 100, 1000], [2, 3, 4, 5])) ) def test_correct_number_of_partitions( - self, num_rows: int, num_unique_cid: int + self, num_rows: int, num_unique_natural_ids: int ) -> None: """Test if the # of available partitions is equal to # of unique clients.""" - _, partitioner = _dummy_setup(num_rows, num_unique_cid) + _, partitioner = _dummy_setup(num_rows, num_unique_natural_ids) _ = partitioner.load_partition(idx=0) - self.assertEqual(len(partitioner.node_id_to_natural_id), num_unique_cid) + self.assertEqual(len(partitioner.node_id_to_natural_id), num_unique_natural_ids) - def test_cannot_set_index_to_cid(self) -> None: + def test_cannot_set_node_id_to_natural_id(self) -> None: """Test the lack of ability to set node_id_to_natural_id.""" - _, partitioner = _dummy_setup(num_rows=10, n_unique_cids=2) + _, partitioner = _dummy_setup(num_rows=10, n_unique_natural_ids=2) with self.assertRaises(AttributeError): partitioner.node_id_to_natural_id = {0: "0"} From 452ef664cbebf95b6934a9eec3b74e3bf20b4d57 Mon Sep 17 00:00:00 2001 From: Adam Narozniak Date: Thu, 9 Nov 2023 14:22:18 +0100 Subject: [PATCH 10/10] Fix formatting --- datasets/flwr_datasets/partitioner/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/datasets/flwr_datasets/partitioner/__init__.py b/datasets/flwr_datasets/partitioner/__init__.py index 4bfb0cb10dc3..5e7c86718f67 100644 --- a/datasets/flwr_datasets/partitioner/__init__.py +++ b/datasets/flwr_datasets/partitioner/__init__.py @@ -17,8 +17,8 @@ from .exponential_partitioner import ExponentialPartitioner from .iid_partitioner import IidPartitioner -from .natural_id_partitioner import NaturalIdPartitioner from .linear_partitioner import LinearPartitioner +from .natural_id_partitioner import NaturalIdPartitioner from .partitioner import Partitioner from .size_partitioner import SizePartitioner from .square_partitioner import SquarePartitioner