From 04347b296dfbb520e3c713f4d40bbb622a93c0aa Mon Sep 17 00:00:00 2001 From: Charles Beauville Date: Tue, 14 Nov 2023 19:47:55 +0100 Subject: [PATCH 01/64] Delete node locally in gRPC-rere (#2596) --- src/py/flwr/client/grpc_rere_client/connection.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/py/flwr/client/grpc_rere_client/connection.py b/src/py/flwr/client/grpc_rere_client/connection.py index 3dcc147e8eca..b69228826e13 100644 --- a/src/py/flwr/client/grpc_rere_client/connection.py +++ b/src/py/flwr/client/grpc_rere_client/connection.py @@ -136,6 +136,8 @@ def delete_node() -> None: delete_node_request = DeleteNodeRequest(node=node) stub.DeleteNode(request=delete_node_request) + del node_store[KEY_NODE] + def receive() -> Optional[TaskIns]: """Receive next task from server.""" # Get Node From e2116b051ff852b340e2a7913a969e551f020145 Mon Sep 17 00:00:00 2001 From: Charles Beauville Date: Tue, 14 Nov 2023 19:52:52 +0100 Subject: [PATCH 02/64] C++ SDK: Delete local node for gRPC-rere (#2597) --- src/cc/flwr/src/grpc_rere.cc | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/src/cc/flwr/src/grpc_rere.cc b/src/cc/flwr/src/grpc_rere.cc index c4920e986b1b..267874a7a0e2 100644 --- a/src/cc/flwr/src/grpc_rere.cc +++ b/src/cc/flwr/src/grpc_rere.cc @@ -19,6 +19,14 @@ std::optional get_node_from_store() { return node->second; } +void delete_node_from_store() { + std::lock_guard lock(node_store_mutex); + auto node = node_store.find(KEY_NODE); + if (node == node_store.end() || !node->second.has_value()) { + node_store.erase(node); + } +} + std::optional get_current_task_ins() { std::lock_guard state_lock(state_mutex); auto current_task_ins = state.find(KEY_TASK_INS); @@ -80,8 +88,7 @@ void delete_node(const std::unique_ptr &stub) { delete_node_request.release_node(); // Release if status is ok } - // TODO: Check if Node needs to be removed from local map - // node_store.erase(node); + delete_node_from_store(); } std::optional From db38b94d09d0d77e96fe99ecbe57db2a9999738a Mon Sep 17 00:00:00 2001 From: Adam Narozniak <51029327+adam-narozniak@users.noreply.github.com> Date: Wed, 15 Nov 2023 10:46:14 +0100 Subject: [PATCH 03/64] Change the settings for IidPartitioner (#2589) --- datasets/flwr_datasets/partitioner/iid_partitioner.py | 2 +- .../flwr_datasets/partitioner/iid_partitioner_test.py | 10 +++++----- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/datasets/flwr_datasets/partitioner/iid_partitioner.py b/datasets/flwr_datasets/partitioner/iid_partitioner.py index 37b97468cadf..c8dbf8294fec 100644 --- a/datasets/flwr_datasets/partitioner/iid_partitioner.py +++ b/datasets/flwr_datasets/partitioner/iid_partitioner.py @@ -48,5 +48,5 @@ def load_partition(self, idx: int) -> datasets.Dataset: single dataset partition """ return self.dataset.shard( - num_shards=self._num_partitions, index=idx, contiguous=False + num_shards=self._num_partitions, index=idx, contiguous=True ) diff --git a/datasets/flwr_datasets/partitioner/iid_partitioner_test.py b/datasets/flwr_datasets/partitioner/iid_partitioner_test.py index 5f851807f4bd..64c37c4e7127 100644 --- a/datasets/flwr_datasets/partitioner/iid_partitioner_test.py +++ b/datasets/flwr_datasets/partitioner/iid_partitioner_test.py @@ -18,7 +18,6 @@ import unittest from typing import Tuple -import numpy as np from parameterized import parameterized from datasets import Dataset @@ -102,14 +101,15 @@ def test_load_partition_correct_data( ) -> None: """Test if the data in partition is equal to the expected.""" dataset, 
partitioner = _dummy_setup(num_partitions, num_rows) + partition_size = num_rows // num_partitions partition_index = 2 partition = partitioner.load_partition(partition_index) row_id = 0 self.assertEqual( - partition["features"][row_id], - dataset[np.arange(partition_index, len(dataset), num_partitions)][ - "features" - ][row_id], + partition[row_id]["features"], + # Note it's contiguous so partition_size * partition_index gets the first + # element of the partition of partition_index + dataset[partition_size * partition_index + row_id]["features"], ) @parameterized.expand( # type: ignore From d00a579d01e4a254c871b6b0594b3690e0c92ad7 Mon Sep 17 00:00:00 2001 From: Adam Narozniak <51029327+adam-narozniak@users.noreply.github.com> Date: Wed, 15 Nov 2023 11:49:38 +0100 Subject: [PATCH 04/64] Fds fix missing __all__ in inits (#2599) --- datasets/flwr_datasets/__init__.py | 10 ++++++--- datasets/flwr_datasets/resplitter/__init__.py | 22 +++++++++++++++++++ .../{ => resplitter}/merge_resplitter.py | 0 .../{ => resplitter}/merge_resplitter_test.py | 2 +- datasets/flwr_datasets/utils.py | 2 +- 5 files changed, 31 insertions(+), 5 deletions(-) create mode 100644 datasets/flwr_datasets/resplitter/__init__.py rename datasets/flwr_datasets/{ => resplitter}/merge_resplitter.py (100%) rename datasets/flwr_datasets/{ => resplitter}/merge_resplitter_test.py (98%) diff --git a/datasets/flwr_datasets/__init__.py b/datasets/flwr_datasets/__init__.py index 89e651e0f558..48d993037708 100644 --- a/datasets/flwr_datasets/__init__.py +++ b/datasets/flwr_datasets/__init__.py @@ -15,10 +15,14 @@ """Flower Datasets main package.""" +from flwr_datasets import partitioner, resplitter from flwr_datasets.common.version import package_version as _package_version +from flwr_datasets.federated_dataset import FederatedDataset -from .federated_dataset import FederatedDataset - -__all__ = ["FederatedDataset"] +__all__ = [ + "FederatedDataset", + "partitioner", + "resplitter", +] __version__ = _package_version diff --git a/datasets/flwr_datasets/resplitter/__init__.py b/datasets/flwr_datasets/resplitter/__init__.py new file mode 100644 index 000000000000..f778d2096b76 --- /dev/null +++ b/datasets/flwr_datasets/resplitter/__init__.py @@ -0,0 +1,22 @@ +# Copyright 2023 Flower Labs GmbH. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# ============================================================================== +"""Resplitter package.""" + + +from .merge_resplitter import MergeResplitter + +__all__ = [ + "MergeResplitter", +] diff --git a/datasets/flwr_datasets/merge_resplitter.py b/datasets/flwr_datasets/resplitter/merge_resplitter.py similarity index 100% rename from datasets/flwr_datasets/merge_resplitter.py rename to datasets/flwr_datasets/resplitter/merge_resplitter.py diff --git a/datasets/flwr_datasets/merge_resplitter_test.py b/datasets/flwr_datasets/resplitter/merge_resplitter_test.py similarity index 98% rename from datasets/flwr_datasets/merge_resplitter_test.py rename to datasets/flwr_datasets/resplitter/merge_resplitter_test.py index 096bd7efac27..ebbdfb4022b0 100644 --- a/datasets/flwr_datasets/merge_resplitter_test.py +++ b/datasets/flwr_datasets/resplitter/merge_resplitter_test.py @@ -21,7 +21,7 @@ import pytest from datasets import Dataset, DatasetDict -from flwr_datasets.merge_resplitter import MergeResplitter +from flwr_datasets.resplitter.merge_resplitter import MergeResplitter class TestResplitter(unittest.TestCase): diff --git a/datasets/flwr_datasets/utils.py b/datasets/flwr_datasets/utils.py index 7badb1445460..49c65e9893a7 100644 --- a/datasets/flwr_datasets/utils.py +++ b/datasets/flwr_datasets/utils.py @@ -19,8 +19,8 @@ from typing import Dict, Optional, Tuple, Union, cast from flwr_datasets.common import Resplitter -from flwr_datasets.merge_resplitter import MergeResplitter from flwr_datasets.partitioner import IidPartitioner, Partitioner +from flwr_datasets.resplitter.merge_resplitter import MergeResplitter tested_datasets = [ "mnist", From 04e2ea3f085b8eb005f8ec4e4a6879fd52de97ed Mon Sep 17 00:00:00 2001 From: Adam Narozniak <51029327+adam-narozniak@users.noreply.github.com> Date: Wed, 15 Nov 2023 13:52:32 +0100 Subject: [PATCH 05/64] Add shuffle seed parameters to FederatedDataset (#2588) --- datasets/flwr_datasets/federated_dataset.py | 86 +++++++++++------- .../flwr_datasets/federated_dataset_test.py | 89 ++++++++++++++++++- 2 files changed, 144 insertions(+), 31 deletions(-) diff --git a/datasets/flwr_datasets/federated_dataset.py b/datasets/flwr_datasets/federated_dataset.py index 6e4e9ca43ccb..b1e61f1f9231 100644 --- a/datasets/flwr_datasets/federated_dataset.py +++ b/datasets/flwr_datasets/federated_dataset.py @@ -38,18 +38,25 @@ class FederatedDataset: Parameters ---------- - dataset: str + dataset : str The name of the dataset in the Hugging Face Hub. - subset: str + subset : str Secondary information regarding the dataset, most often subset or version (that is passed to the name in datasets.load_dataset). - resplitter: Optional[Union[Resplitter, Dict[str, Tuple[str, ...]]]] + resplitter : Optional[Union[Resplitter, Dict[str, Tuple[str, ...]]]] `Callable` that transforms `DatasetDict` splits, or configuration dict for `MergeResplitter`. - partitioners: Dict[str, Union[Partitioner, int]] + partitioners : Dict[str, Union[Partitioner, int]] A dictionary mapping the Dataset split (a `str`) to a `Partitioner` or an `int` (representing the number of IID partitions that this split should be partitioned into). + shuffle : bool + Whether to randomize the order of samples. Applied prior to resplitting, + speratelly to each of the present splits in the dataset. It uses the `seed` + argument. Defaults to True. + seed : Optional[int] + Seed used for dataset shuffling. It has no effect if `shuffle` is False. The + seed cannot be set in the later stages. 
Examples -------- @@ -66,6 +73,7 @@ class FederatedDataset: >>> centralized = mnist_fds.load_full("test") """ + # pylint: disable=too-many-instance-attributes def __init__( self, *, @@ -73,6 +81,8 @@ def __init__( subset: Optional[str] = None, resplitter: Optional[Union[Resplitter, Dict[str, Tuple[str, ...]]]] = None, partitioners: Dict[str, Union[Partitioner, int]], + shuffle: bool = True, + seed: Optional[int] = 42, ) -> None: _check_if_dataset_tested(dataset) self._dataset_name: str = dataset @@ -83,9 +93,13 @@ def __init__( self._partitioners: Dict[str, Partitioner] = _instantiate_partitioners( partitioners ) - # Init (download) lazily on the first call to `load_partition` or `load_full` + self._shuffle = shuffle + self._seed = seed + # _dataset is prepared lazily on the first call to `load_partition` + # or `load_full`. See _prepare_datasets for more details self._dataset: Optional[DatasetDict] = None - self._resplit: bool = False # Indicate if the resplit happened + # Indicate if the dataset is prepared for `load_partition` or `load_full` + self._dataset_prepared: bool = False def load_partition(self, idx: int, split: str) -> Dataset: """Load the partition specified by the idx in the selected split. @@ -95,9 +109,9 @@ def load_partition(self, idx: int, split: str) -> Dataset: Parameters ---------- - idx: int + idx : int Partition index for the selected split, idx in {0, ..., num_partitions - 1}. - split: str + split : str Name of the (partitioned) split (e.g. "train", "test"). Returns @@ -105,8 +119,8 @@ def load_partition(self, idx: int, split: str) -> Dataset: partition: Dataset Single partition from the dataset split. """ - self._download_dataset_if_none() - self._resplit_dataset_if_needed() + if not self._dataset_prepared: + self._prepare_dataset() if self._dataset is None: raise ValueError("Dataset is not loaded yet.") self._check_if_split_present(split) @@ -123,7 +137,7 @@ def load_full(self, split: str) -> Dataset: Parameters ---------- - split: str + split : str Split name of the downloaded dataset (e.g. "train", "test"). Returns @@ -131,20 +145,13 @@ def load_full(self, split: str) -> Dataset: dataset_split: Dataset Part of the dataset identified by its split name. """ - self._download_dataset_if_none() - self._resplit_dataset_if_needed() + if not self._dataset_prepared: + self._prepare_dataset() if self._dataset is None: raise ValueError("Dataset is not loaded yet.") self._check_if_split_present(split) return self._dataset[split] - def _download_dataset_if_none(self) -> None: - """Lazily load (and potentially download) the Dataset instance into memory.""" - if self._dataset is None: - self._dataset = datasets.load_dataset( - path=self._dataset_name, name=self._subset - ) - def _check_if_split_present(self, split: str) -> None: """Check if the split (for partitioning or full return) is in the dataset.""" if self._dataset is None: @@ -176,15 +183,34 @@ def _assign_dataset_to_partitioner(self, split: str) -> None: if not self._partitioners[split].is_dataset_assigned(): self._partitioners[split].dataset = self._dataset[split] - def _resplit_dataset_if_needed(self) -> None: - # The actual re-splitting can't be done more than once. - # The attribute `_resplit` indicates that the resplit happened. - - # Resplit only once - if self._resplit: - return - if self._dataset is None: - raise ValueError("The dataset resplit should happen after the download.") + def _prepare_dataset(self) -> None: + """Prepare the dataset (prior to partitioning) by download, shuffle, replit. 
+ + Run only ONCE when triggered by load_* function. (In future more control whether + this should happen lazily or not can be added). The operations done here should + not happen more than once. + + It is controlled by a single flag, `_dataset_prepared` that is set True at the + end of the function. + + Notes + ----- + The shuffling should happen before the resplitting. Here is the explanation. + If the dataset has a non-random order of samples e.g. each split has first + only label 0, then only label 1. Then in case of resplitting e.g. + someone creates: "train" train[:int(0.75 * len(train))], test: concat( + train[int(0.75 * len(train)):], test). The new test took the 0.25 of e.g. + the train that is only label 0 (assuming the equal count of labels). + Therefore, for such edge cases (for which we have split) the split should + happen before the resplitting. + """ + self._dataset = datasets.load_dataset( + path=self._dataset_name, name=self._subset + ) + if self._shuffle: + # Note it shuffles all the splits. The self._dataset is DatasetDict + # so e.g. {"train": train_data, "test": test_data}. All splits get shuffled. + self._dataset = self._dataset.shuffle(seed=self._seed) if self._resplitter: self._dataset = self._resplitter(self._dataset) - self._resplit = True + self._dataset_prepared = True diff --git a/datasets/flwr_datasets/federated_dataset_test.py b/datasets/flwr_datasets/federated_dataset_test.py index ca2bc97a33ee..1e36fd565d06 100644 --- a/datasets/flwr_datasets/federated_dataset_test.py +++ b/datasets/flwr_datasets/federated_dataset_test.py @@ -18,12 +18,13 @@ import unittest from typing import Dict, Union +from unittest.mock import Mock, patch import pytest from parameterized import parameterized, parameterized_class import datasets -from datasets import DatasetDict, concatenate_datasets +from datasets import Dataset, DatasetDict, concatenate_datasets from flwr_datasets.federated_dataset import FederatedDataset from flwr_datasets.partitioner import IidPartitioner, Partitioner @@ -144,6 +145,92 @@ def resplit(dataset: DatasetDict) -> DatasetDict: self.assertEqual(len(full), dataset_length) +class ArtificialDatasetTest(unittest.TestCase): + """Test using small artificial dataset, mocked load_dataset.""" + + # pylint: disable=no-self-use + def _dummy_setup(self, train_rows: int = 10, test_rows: int = 5) -> DatasetDict: + """Create a dummy DatasetDict with train, test splits.""" + data_train = { + "features": list(range(train_rows)), + "labels": list(range(100, 100 + train_rows)), + } + data_test = { + "features": [200] + [201] * (test_rows - 1), + "labels": [202] + [203] * (test_rows - 1), + } + train_dataset = Dataset.from_dict(data_train) + test_dataset = Dataset.from_dict(data_test) + return DatasetDict({"train": train_dataset, "test": test_dataset}) + + @patch("datasets.load_dataset") + def test_shuffling_applied(self, mock_func: Mock) -> None: + """Test if argument is used.""" + dummy_ds = self._dummy_setup() + mock_func.return_value = dummy_ds + + expected_result = dummy_ds.shuffle(seed=42)["train"]["features"] + fds = FederatedDataset( + dataset="does-not-matter", partitioners={"train": 10}, shuffle=True, seed=42 + ) + train = fds.load_full("train") + # This should be shuffled + result = train["features"] + + self.assertEqual(expected_result, result) + + @patch("datasets.load_dataset") + def test_shuffling_not_applied(self, mock_func: Mock) -> None: + """Test if argument is not used.""" + dummy_ds = self._dummy_setup() + mock_func.return_value = dummy_ds + + 
expected_result = dummy_ds["train"]["features"] + fds = FederatedDataset( + dataset="does-not-matter", + partitioners={"train": 10}, + shuffle=False, + ) + train = fds.load_full("train") + # This should not be shuffled + result = train["features"] + + self.assertEqual(expected_result, result) + + @patch("datasets.load_dataset") + def test_shuffling_before_to_resplitting_applied(self, mock_func: Mock) -> None: + """Check if the order is met and if the shuffling happens.""" + + def resplit(dataset: DatasetDict) -> DatasetDict: + # "Move" the last sample from test to train + return DatasetDict( + { + "train": concatenate_datasets( + [dataset["train"], dataset["test"].select([0])] + ), + "test": dataset["test"].select(range(1, dataset["test"].num_rows)), + } + ) + + dummy_ds = self._dummy_setup() + mock_func.return_value = dummy_ds + + expected_result = concatenate_datasets( + [dummy_ds["train"].shuffle(42), dummy_ds["test"].shuffle(42).select([0])] + )["features"] + fds = FederatedDataset( + dataset="does-not-matter", + partitioners={"train": 10}, + resplitter=resplit, + shuffle=True, + ) + train = fds.load_full("train") + # This should not be shuffled + result = train["features"] + + self.assertEqual(expected_result, result) + + class PartitionersSpecificationForFederatedDatasets(unittest.TestCase): """Test the specifications of partitioners for `FederatedDataset`.""" From 3c8975476e0201fe75a1aeed46a23cbb8897d1d1 Mon Sep 17 00:00:00 2001 From: Adam Narozniak <51029327+adam-narozniak@users.noreply.github.com> Date: Wed, 15 Nov 2023 14:22:08 +0100 Subject: [PATCH 06/64] Make the split keyword optional for load_partition (#2423) --- datasets/flwr_datasets/federated_dataset.py | 25 ++++++++++++++---- .../flwr_datasets/federated_dataset_test.py | 26 +++++++++++++++++++ 2 files changed, 46 insertions(+), 5 deletions(-) diff --git a/datasets/flwr_datasets/federated_dataset.py b/datasets/flwr_datasets/federated_dataset.py index b1e61f1f9231..52acc4d40100 100644 --- a/datasets/flwr_datasets/federated_dataset.py +++ b/datasets/flwr_datasets/federated_dataset.py @@ -101,7 +101,7 @@ def __init__( # Indicate if the dataset is prepared for `load_partition` or `load_full` self._dataset_prepared: bool = False - def load_partition(self, idx: int, split: str) -> Dataset: + def load_partition(self, idx: int, split: Optional[str] = None) -> Dataset: """Load the partition specified by the idx in the selected split. The dataset is downloaded only when the first call to `load_partition` or @@ -111,18 +111,26 @@ def load_partition(self, idx: int, split: str) -> Dataset: ---------- idx : int Partition index for the selected split, idx in {0, ..., num_partitions - 1}. - split : str - Name of the (partitioned) split (e.g. "train", "test"). + split : Optional[str] + Name of the (partitioned) split (e.g. "train", "test"). You can skip this + parameter if there is only one partitioner for the dataset. The name will be + inferred automatically. For example, if `partitioners={"train": 10}`, you do + not need to provide this argument, but if `partitioners={"train": 10, + "test": 100}`, you need to set it to differentiate which partitioner should + be used. Returns ------- - partition: Dataset + partition : Dataset Single partition from the dataset split. 
""" if not self._dataset_prepared: self._prepare_dataset() if self._dataset is None: raise ValueError("Dataset is not loaded yet.") + if split is None: + self._check_if_no_split_keyword_possible() + split = list(self._partitioners.keys())[0] self._check_if_split_present(split) self._check_if_split_possible_to_federate(split) partitioner: Partitioner = self._partitioners[split] @@ -142,7 +150,7 @@ def load_full(self, split: str) -> Dataset: Returns ------- - dataset_split: Dataset + dataset_split : Dataset Part of the dataset identified by its split name. """ if not self._dataset_prepared: @@ -214,3 +222,10 @@ def _prepare_dataset(self) -> None: if self._resplitter: self._dataset = self._resplitter(self._dataset) self._dataset_prepared = True + + def _check_if_no_split_keyword_possible(self) -> None: + if len(self._partitioners) != 1: + raise ValueError( + "Please set the `split` argument. You can only omit the split keyword " + "if there is exactly one partitioner specified." + ) diff --git a/datasets/flwr_datasets/federated_dataset_test.py b/datasets/flwr_datasets/federated_dataset_test.py index 1e36fd565d06..e02b6ed5add8 100644 --- a/datasets/flwr_datasets/federated_dataset_test.py +++ b/datasets/flwr_datasets/federated_dataset_test.py @@ -95,6 +95,18 @@ def test_multiple_partitioners(self) -> None: len(dataset[self.test_split]) // num_test_partitions, ) + def test_no_need_for_split_keyword_if_one_partitioner(self) -> None: + """Test if partitions got with and without split args are the same.""" + fds = FederatedDataset(dataset="mnist", partitioners={"train": 10}) + partition_loaded_with_no_split_arg = fds.load_partition(0) + partition_loaded_with_verbose_split_arg = fds.load_partition(0, "train") + self.assertTrue( + datasets_are_equal( + partition_loaded_with_no_split_arg, + partition_loaded_with_verbose_split_arg, + ) + ) + def test_resplit_dataset_into_one(self) -> None: """Test resplit into a single dataset.""" dataset = datasets.load_dataset(self.dataset_name) @@ -340,5 +352,19 @@ def test_cannot_use_the_old_split_names(self) -> None: fds.load_partition(0, "train") +def datasets_are_equal(ds1: Dataset, ds2: Dataset) -> bool: + """Check if two Datasets have the same values.""" + # Check if both datasets have the same length + if len(ds1) != len(ds2): + return False + + # Iterate over each row and check for equality + for row1, row2 in zip(ds1, ds2): + if row1 != row2: + return False + + return True + + if __name__ == "__main__": unittest.main() From 93b8a3dc2df6243dd9e395721a2a42f60516f8ae Mon Sep 17 00:00:00 2001 From: Adam Narozniak <51029327+adam-narozniak@users.noreply.github.com> Date: Wed, 15 Nov 2023 15:05:30 +0100 Subject: [PATCH 07/64] Rename idx to node_id (#2590) --- datasets/README.md | 2 +- datasets/flwr_datasets/federated_dataset.py | 6 +++--- .../flwr_datasets/partitioner/exponential_partitioner.py | 4 ++-- datasets/flwr_datasets/partitioner/iid_partitioner.py | 6 +++--- datasets/flwr_datasets/partitioner/linear_partitioner.py | 2 +- .../flwr_datasets/partitioner/natural_id_partitioner.py | 6 +++--- .../partitioner/natural_id_partitioner_test.py | 2 +- datasets/flwr_datasets/partitioner/partitioner.py | 4 ++-- datasets/flwr_datasets/partitioner/size_partitioner.py | 8 ++++---- datasets/flwr_datasets/partitioner/square_partitioner.py | 2 +- 10 files changed, 21 insertions(+), 21 deletions(-) diff --git a/datasets/README.md b/datasets/README.md index 982da3760cc7..f4b3fab73d8e 100644 --- a/datasets/README.md +++ b/datasets/README.md @@ -53,7 +53,7 @@ If you plan to 
change the type of the dataset to run the code with your ML frame # Usage -The Flower Datasets exposes `FederatedDataset(dataset, partitioners)` abstraction to represent the dataset needed for federated learning/analytics. It has two powerful methods that let you handle the dataset preprocessing. They are `load_partition(idx, split)` and `load_full(split)`. +The Flower Datasets exposes `FederatedDataset(dataset, partitioners)` abstraction to represent the dataset needed for federated learning/analytics. It has two powerful methods that let you handle the dataset preprocessing. They are `load_partition(node_id, split)` and `load_full(split)`. Here's a quick example of how to partition the MNIST dataset: diff --git a/datasets/flwr_datasets/federated_dataset.py b/datasets/flwr_datasets/federated_dataset.py index 52acc4d40100..cad137a98ba8 100644 --- a/datasets/flwr_datasets/federated_dataset.py +++ b/datasets/flwr_datasets/federated_dataset.py @@ -101,7 +101,7 @@ def __init__( # Indicate if the dataset is prepared for `load_partition` or `load_full` self._dataset_prepared: bool = False - def load_partition(self, idx: int, split: Optional[str] = None) -> Dataset: + def load_partition(self, node_id: int, split: Optional[str] = None) -> Dataset: """Load the partition specified by the idx in the selected split. The dataset is downloaded only when the first call to `load_partition` or @@ -109,7 +109,7 @@ def load_partition(self, idx: int, split: Optional[str] = None) -> Dataset: Parameters ---------- - idx : int + node_id : int Partition index for the selected split, idx in {0, ..., num_partitions - 1}. split : Optional[str] Name of the (partitioned) split (e.g. "train", "test"). You can skip this @@ -135,7 +135,7 @@ def load_partition(self, idx: int, split: Optional[str] = None) -> Dataset: self._check_if_split_possible_to_federate(split) partitioner: Partitioner = self._partitioners[split] self._assign_dataset_to_partitioner(split) - return partitioner.load_partition(idx) + return partitioner.load_partition(node_id) def load_full(self, split: str) -> Dataset: """Load the full split of the dataset. diff --git a/datasets/flwr_datasets/partitioner/exponential_partitioner.py b/datasets/flwr_datasets/partitioner/exponential_partitioner.py index 07bb07cea9a5..10b11eb3e126 100644 --- a/datasets/flwr_datasets/partitioner/exponential_partitioner.py +++ b/datasets/flwr_datasets/partitioner/exponential_partitioner.py @@ -21,7 +21,7 @@ class ExponentialPartitioner(SizePartitioner): - """Partitioner creates partitions of size that are correlated with exp(idx). + """Partitioner creates partitions of size that are correlated with exp(node_id). The amount of data each client gets is correlated with the exponent of partition ID. For instance, if the IDs range from 1 to M, client with ID 1 gets e units of @@ -29,7 +29,7 @@ class ExponentialPartitioner(SizePartitioner): The floor operation is applied on each of these numbers, it means floor(2.71...) = 2; e^2 ~ 7.39 floor(7.39) = 7. The number is rounded down = the fraction is always cut. The remainders of theses unassigned (fraction) samples is added to the - biggest partition (the one with the biggest idx). + biggest partition (the one with the biggest node_id). 
Parameters ---------- diff --git a/datasets/flwr_datasets/partitioner/iid_partitioner.py b/datasets/flwr_datasets/partitioner/iid_partitioner.py index c8dbf8294fec..2df0d7df1ebf 100644 --- a/datasets/flwr_datasets/partitioner/iid_partitioner.py +++ b/datasets/flwr_datasets/partitioner/iid_partitioner.py @@ -34,12 +34,12 @@ def __init__(self, num_partitions: int) -> None: raise ValueError("The number of partitions must be greater than zero.") self._num_partitions = num_partitions - def load_partition(self, idx: int) -> datasets.Dataset: + def load_partition(self, node_id: int) -> datasets.Dataset: """Load a single IID partition based on the partition index. Parameters ---------- - idx: int + node_id : int the index that corresponds to the requested partition Returns @@ -48,5 +48,5 @@ def load_partition(self, idx: int) -> datasets.Dataset: single dataset partition """ return self.dataset.shard( - num_shards=self._num_partitions, index=idx, contiguous=True + num_shards=self._num_partitions, index=node_id, contiguous=True ) diff --git a/datasets/flwr_datasets/partitioner/linear_partitioner.py b/datasets/flwr_datasets/partitioner/linear_partitioner.py index a9dc8b020c08..f77b0b87146d 100644 --- a/datasets/flwr_datasets/partitioner/linear_partitioner.py +++ b/datasets/flwr_datasets/partitioner/linear_partitioner.py @@ -19,7 +19,7 @@ class LinearPartitioner(SizePartitioner): - """Partitioner creates partitions of size that are linearly correlated with idx. + """Partitioner creates partitions of size that are linearly correlated with node_id. The amount of data each client gets is linearly correlated with the partition ID. For instance, if the IDs range from 1 to M, client with ID 1 gets 1 unit of data, diff --git a/datasets/flwr_datasets/partitioner/natural_id_partitioner.py b/datasets/flwr_datasets/partitioner/natural_id_partitioner.py index 9591cf911139..bc6c19cd9cb3 100644 --- a/datasets/flwr_datasets/partitioner/natural_id_partitioner.py +++ b/datasets/flwr_datasets/partitioner/natural_id_partitioner.py @@ -42,7 +42,7 @@ def _create_int_node_id_to_natural_id(self) -> None: zip(range(len(unique_natural_ids)), unique_natural_ids) ) - def load_partition(self, idx: int) -> datasets.Dataset: + def load_partition(self, node_id: int) -> datasets.Dataset: """Load a single partition corresponding to a single `node_id`. 
The choice of the partition is based on unique integers assigned to each @@ -50,7 +50,7 @@ def load_partition(self, idx: int) -> datasets.Dataset: Parameters ---------- - idx: int + node_id : int the index that corresponds to the requested partition Returns @@ -62,7 +62,7 @@ def load_partition(self, idx: int) -> datasets.Dataset: self._create_int_node_id_to_natural_id() return self.dataset.filter( - lambda row: row[self._partition_by] == self._node_id_to_natural_id[idx] + lambda row: row[self._partition_by] == self._node_id_to_natural_id[node_id] ) @property diff --git a/datasets/flwr_datasets/partitioner/natural_id_partitioner_test.py b/datasets/flwr_datasets/partitioner/natural_id_partitioner_test.py index 4e382b77095c..fb296294aec3 100644 --- a/datasets/flwr_datasets/partitioner/natural_id_partitioner_test.py +++ b/datasets/flwr_datasets/partitioner/natural_id_partitioner_test.py @@ -105,7 +105,7 @@ def test_correct_number_of_partitions( ) -> None: """Test if the # of available partitions is equal to # of unique clients.""" _, partitioner = _dummy_setup(num_rows, num_unique_natural_ids) - _ = partitioner.load_partition(idx=0) + _ = partitioner.load_partition(node_id=0) self.assertEqual(len(partitioner.node_id_to_natural_id), num_unique_natural_ids) def test_cannot_set_node_id_to_natural_id(self) -> None: diff --git a/datasets/flwr_datasets/partitioner/partitioner.py b/datasets/flwr_datasets/partitioner/partitioner.py index f154d265175a..7ab15a54d49e 100644 --- a/datasets/flwr_datasets/partitioner/partitioner.py +++ b/datasets/flwr_datasets/partitioner/partitioner.py @@ -53,12 +53,12 @@ def dataset(self, value: Dataset) -> None: self._dataset = value @abstractmethod - def load_partition(self, idx: int) -> Dataset: + def load_partition(self, node_id: int) -> Dataset: """Load a single partition based on the partition index. Parameters ---------- - idx: int + node_id: int the index that corresponds to the requested partition Returns diff --git a/datasets/flwr_datasets/partitioner/size_partitioner.py b/datasets/flwr_datasets/partitioner/size_partitioner.py index 27e1f528dc52..35ca750949ee 100644 --- a/datasets/flwr_datasets/partitioner/size_partitioner.py +++ b/datasets/flwr_datasets/partitioner/size_partitioner.py @@ -64,14 +64,14 @@ def __init__( # A flag to perform only a single compute to determine the indices self._node_id_to_indices_determined = False - def load_partition(self, idx: int) -> datasets.Dataset: + def load_partition(self, node_id: int) -> datasets.Dataset: """Load a single partition based on the partition index. - For this partitioner the number of samples is dependent on the partition idx. + The number of samples is dependent on the partition node_id. Parameters ---------- - idx : int + node_id : int the index that corresponds to the requested partition Returns @@ -82,7 +82,7 @@ def load_partition(self, idx: int) -> datasets.Dataset: # The partitioning is done lazily - only when the first partition is requested. # A single run creates the indices assignments for all the partition indices. 
self._determine_node_id_to_indices_if_needed() - return self.dataset.select(self._node_id_to_indices[idx]) + return self.dataset.select(self._node_id_to_indices[node_id]) @property def node_id_to_size(self) -> Dict[int, int]: diff --git a/datasets/flwr_datasets/partitioner/square_partitioner.py b/datasets/flwr_datasets/partitioner/square_partitioner.py index 25affaf304ef..109b8397870b 100644 --- a/datasets/flwr_datasets/partitioner/square_partitioner.py +++ b/datasets/flwr_datasets/partitioner/square_partitioner.py @@ -21,7 +21,7 @@ class SquarePartitioner(SizePartitioner): - """Partitioner creates partitions of size that are correlated with squared idx. + """Partitioner creates partitions of size that are correlated with squared node_id. The amount of data each client gets is correlated with the squared partition ID. For instance, if the IDs range from 1 to M, client with ID 1 gets 1 unit of data, From 60199e7a0dcb84462963eb029c41735dd9f0e84d Mon Sep 17 00:00:00 2001 From: Adam Narozniak <51029327+adam-narozniak@users.noreply.github.com> Date: Wed, 15 Nov 2023 15:26:35 +0100 Subject: [PATCH 08/64] Clarify partitioners parameter description in docs (#2603) --- datasets/flwr_datasets/federated_dataset.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/datasets/flwr_datasets/federated_dataset.py b/datasets/flwr_datasets/federated_dataset.py index cad137a98ba8..2f755a124afb 100644 --- a/datasets/flwr_datasets/federated_dataset.py +++ b/datasets/flwr_datasets/federated_dataset.py @@ -49,7 +49,8 @@ class FederatedDataset: partitioners : Dict[str, Union[Partitioner, int]] A dictionary mapping the Dataset split (a `str`) to a `Partitioner` or an `int` (representing the number of IID partitions that this split should be partitioned - into). + into). One or multiple `Partitioner`s can be specified in that manner, but at + most, one per split. shuffle : bool Whether to randomize the order of samples. Applied prior to resplitting, speratelly to each of the present splits in the dataset. It uses the `seed` From 59b8c8c7be81b3fff95d64943ce267818a1131ff Mon Sep 17 00:00:00 2001 From: Adam Narozniak <51029327+adam-narozniak@users.noreply.github.com> Date: Wed, 15 Nov 2023 15:51:36 +0100 Subject: [PATCH 09/64] Fix spacing around : in parameters in docs (#2601) --- datasets/flwr_datasets/partitioner/iid_partitioner.py | 4 ++-- .../flwr_datasets/partitioner/natural_id_partitioner.py | 2 +- datasets/flwr_datasets/partitioner/partitioner.py | 6 +++--- datasets/flwr_datasets/resplitter/merge_resplitter.py | 2 +- datasets/flwr_datasets/utils.py | 4 ++-- 5 files changed, 9 insertions(+), 9 deletions(-) diff --git a/datasets/flwr_datasets/partitioner/iid_partitioner.py b/datasets/flwr_datasets/partitioner/iid_partitioner.py index 2df0d7df1ebf..c72b34f081f2 100644 --- a/datasets/flwr_datasets/partitioner/iid_partitioner.py +++ b/datasets/flwr_datasets/partitioner/iid_partitioner.py @@ -24,7 +24,7 @@ class IidPartitioner(Partitioner): Parameters ---------- - num_partitions: int + num_partitions : int The total number of partitions that the data will be divided into. 
""" @@ -44,7 +44,7 @@ def load_partition(self, node_id: int) -> datasets.Dataset: Returns ------- - dataset_partition: Dataset + dataset_partition : Dataset single dataset partition """ return self.dataset.shard( diff --git a/datasets/flwr_datasets/partitioner/natural_id_partitioner.py b/datasets/flwr_datasets/partitioner/natural_id_partitioner.py index bc6c19cd9cb3..b8f28696f3b7 100644 --- a/datasets/flwr_datasets/partitioner/natural_id_partitioner.py +++ b/datasets/flwr_datasets/partitioner/natural_id_partitioner.py @@ -55,7 +55,7 @@ def load_partition(self, node_id: int) -> datasets.Dataset: Returns ------- - dataset_partition: Dataset + dataset_partition : Dataset single dataset partition """ if len(self._node_id_to_natural_id) == 0: diff --git a/datasets/flwr_datasets/partitioner/partitioner.py b/datasets/flwr_datasets/partitioner/partitioner.py index 7ab15a54d49e..92405152efc6 100644 --- a/datasets/flwr_datasets/partitioner/partitioner.py +++ b/datasets/flwr_datasets/partitioner/partitioner.py @@ -58,12 +58,12 @@ def load_partition(self, node_id: int) -> Dataset: Parameters ---------- - node_id: int + node_id : int the index that corresponds to the requested partition Returns ------- - dataset_partition: Dataset + dataset_partition : Dataset single dataset partition """ @@ -75,7 +75,7 @@ def is_dataset_assigned(self) -> bool: Returns ------- - dataset_assigned: bool + dataset_assigned : bool True if a dataset is assigned, otherwise False. """ return self._dataset is not None diff --git a/datasets/flwr_datasets/resplitter/merge_resplitter.py b/datasets/flwr_datasets/resplitter/merge_resplitter.py index 995b0e8e5602..6bb8f23e60dc 100644 --- a/datasets/flwr_datasets/resplitter/merge_resplitter.py +++ b/datasets/flwr_datasets/resplitter/merge_resplitter.py @@ -32,7 +32,7 @@ class MergeResplitter: Parameters ---------- - merge_config: Dict[str, Tuple[str, ...]] + merge_config : Dict[str, Tuple[str, ...]] Dictionary with keys - the desired split names to values - tuples of the current split names that will be merged together diff --git a/datasets/flwr_datasets/utils.py b/datasets/flwr_datasets/utils.py index 49c65e9893a7..61fdc5b1db83 100644 --- a/datasets/flwr_datasets/utils.py +++ b/datasets/flwr_datasets/utils.py @@ -38,12 +38,12 @@ def _instantiate_partitioners( Parameters ---------- - partitioners: Dict[str, Union[Partitioner, int]] + partitioners : Dict[str, Union[Partitioner, int]] Dataset split to the Partitioner or a number of IID partitions. Returns ------- - partitioners: Dict[str, Partitioner] + partitioners : Dict[str, Partitioner] Partitioners specified as split to Partitioner object. 
""" instantiated_partitioners: Dict[str, Partitioner] = {} From d8fef6e3dfa8a5e92aebf7dbf740082a9c28c43d Mon Sep 17 00:00:00 2001 From: Adam Narozniak <51029327+adam-narozniak@users.noreply.github.com> Date: Wed, 15 Nov 2023 16:42:17 +0100 Subject: [PATCH 10/64] Move the Resplitter type definition to resplitter package (#2604) --- datasets/flwr_datasets/common/__init__.py | 5 ----- datasets/flwr_datasets/federated_dataset.py | 2 +- datasets/flwr_datasets/resplitter/__init__.py | 2 ++ .../{common/typing.py => resplitter/resplitter.py} | 2 +- datasets/flwr_datasets/utils.py | 2 +- 5 files changed, 5 insertions(+), 8 deletions(-) rename datasets/flwr_datasets/{common/typing.py => resplitter/resplitter.py} (95%) diff --git a/datasets/flwr_datasets/common/__init__.py b/datasets/flwr_datasets/common/__init__.py index a6468bcf7fda..b4f12f8641b3 100644 --- a/datasets/flwr_datasets/common/__init__.py +++ b/datasets/flwr_datasets/common/__init__.py @@ -13,8 +13,3 @@ # limitations under the License. # ============================================================================== """Common components in Flower Datasets.""" - - -from .typing import Resplitter - -__all__ = ["Resplitter"] diff --git a/datasets/flwr_datasets/federated_dataset.py b/datasets/flwr_datasets/federated_dataset.py index 2f755a124afb..d102df2c9ae6 100644 --- a/datasets/flwr_datasets/federated_dataset.py +++ b/datasets/flwr_datasets/federated_dataset.py @@ -19,8 +19,8 @@ import datasets from datasets import Dataset, DatasetDict -from flwr_datasets.common import Resplitter from flwr_datasets.partitioner import Partitioner +from flwr_datasets.resplitter import Resplitter from flwr_datasets.utils import ( _check_if_dataset_tested, _instantiate_partitioners, diff --git a/datasets/flwr_datasets/resplitter/__init__.py b/datasets/flwr_datasets/resplitter/__init__.py index f778d2096b76..e0b2dc0dcc1c 100644 --- a/datasets/flwr_datasets/resplitter/__init__.py +++ b/datasets/flwr_datasets/resplitter/__init__.py @@ -16,7 +16,9 @@ from .merge_resplitter import MergeResplitter +from .resplitter import Resplitter __all__ = [ "MergeResplitter", + "Resplitter", ] diff --git a/datasets/flwr_datasets/common/typing.py b/datasets/flwr_datasets/resplitter/resplitter.py similarity index 95% rename from datasets/flwr_datasets/common/typing.py rename to datasets/flwr_datasets/resplitter/resplitter.py index 28e6bae4a505..206e2e85730c 100644 --- a/datasets/flwr_datasets/common/typing.py +++ b/datasets/flwr_datasets/resplitter/resplitter.py @@ -12,7 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. 
# ============================================================================== -"""Flower Datasets type definitions.""" +"""Resplitter.""" from typing import Callable diff --git a/datasets/flwr_datasets/utils.py b/datasets/flwr_datasets/utils.py index 61fdc5b1db83..e3d0fdfffa63 100644 --- a/datasets/flwr_datasets/utils.py +++ b/datasets/flwr_datasets/utils.py @@ -18,8 +18,8 @@ import warnings from typing import Dict, Optional, Tuple, Union, cast -from flwr_datasets.common import Resplitter from flwr_datasets.partitioner import IidPartitioner, Partitioner +from flwr_datasets.resplitter import Resplitter from flwr_datasets.resplitter.merge_resplitter import MergeResplitter tested_datasets = [ From 0c3d8b0954117e8e6996f54489210f9e9d5ceced Mon Sep 17 00:00:00 2001 From: Adam Narozniak <51029327+adam-narozniak@users.noreply.github.com> Date: Wed, 15 Nov 2023 16:49:35 +0100 Subject: [PATCH 11/64] Update FDS README (#2605) --- datasets/README.md | 30 +++++++++++++++--------------- 1 file changed, 15 insertions(+), 15 deletions(-) diff --git a/datasets/README.md b/datasets/README.md index f4b3fab73d8e..876b6f453fa5 100644 --- a/datasets/README.md +++ b/datasets/README.md @@ -3,7 +3,7 @@ [![GitHub license](https://img.shields.io/github/license/adap/flower)](https://github.com/adap/flower/blob/main/LICENSE) [![PRs Welcome](https://img.shields.io/badge/PRs-welcome-brightgreen.svg)](https://github.com/adap/flower/blob/main/CONTRIBUTING.md) ![Build](https://github.com/adap/flower/actions/workflows/framework.yml/badge.svg) -![Downloads](https://pepy.tech/badge/flwr) +![Downloads](https://pepy.tech/badge/flwr-datasets) [![Slack](https://img.shields.io/badge/Chat-Slack-red)](https://flower.dev/join-slack) Flower Datasets (`flwr-datasets`) is a library to quickly and easily create datasets for federated learning, federated evaluation, and federated analytics. It was created by the `Flower Labs` team that also created Flower: A Friendly Federated Learning Framework. @@ -22,7 +22,13 @@ Thanks to using Hugging Face's `datasets` used under the hood, Flower Datasets i * Arrow. Create **custom partitioning schemes** or choose from the **implemented partitioning schemes**: +* Partitioner (the abstract base class) `Partitioner` * IID partitioning `IidPartitioner(num_partitions)` +* Natural ID partitioner `NaturalIdPartitioner` +* Size partitioner (the abstract base class for the partitioners dictating the division based the number of samples) `SizePartitioner` +* Linear partitioner `LinearPartitioner` +* Square partitioner `SquarePartitioner` +* Exponential partitioner `ExponentialPartitioner` * more to come in future releases. # Installation @@ -53,9 +59,9 @@ If you plan to change the type of the dataset to run the code with your ML frame # Usage -The Flower Datasets exposes `FederatedDataset(dataset, partitioners)` abstraction to represent the dataset needed for federated learning/analytics. It has two powerful methods that let you handle the dataset preprocessing. They are `load_partition(node_id, split)` and `load_full(split)`. +Flower Datasets exposes the `FederatedDataset` abstraction to represent the dataset needed for federated learning/evaluation/analytics. It has two powerful methods that let you handle the dataset preprocessing: `load_partition(node_id, split)` and `load_full(split)`. 
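Any of the partitioners listed above can also be passed as objects through the `partitioners` argument instead of the integer shorthand used in the basic quickstart below. A minimal sketch (the dataset name, partition count, and partition index are only illustrative):

```
from flwr_datasets import FederatedDataset
from flwr_datasets.partitioner import LinearPartitioner

# Partition sizes grow with the partition ID instead of being equal (IID).
fds = FederatedDataset(
    dataset="mnist",
    partitioners={"train": LinearPartitioner(num_partitions=10)},
)
partition = fds.load_partition(node_id=1)  # split can be omitted with a single partitioner
centralized = fds.load_full("test")
```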
-Here's a quick example of how to partition the MNIST dataset: +Here's a basic quickstart example of how to partition the MNIST dataset: ``` from flwr_datasets import FederatedDataset @@ -68,23 +74,17 @@ mnist_partition_0 = mnist_fds.load_partition(0, "train") centralized_data = mnist_fds.load_full("test") ``` -`FederatedDataset(dataset, partitioners)` allows you specification of: - -* `dataset:str` - the name of the dataset. - -* `partitioners: Dict[str: int]` - `{split_name: str` to `number-of-partitions: int}` - partitioner that will be used with an associated split of the dataset e.g. `{"train": 100}`. It assumes by default the i.i.d. partitioning. - -More customization of `partitioners` is coming in future releases. +For more details, please refer to the specific how-to guides or tutorial. They showcase customization and more advanced features. # Future release Here are a few of the things that we will work on in future releases: -* Support for more datasets (especially the ones that have user id present). -* Creation of custom `Partitioner`s. -* More out-of-the-box `Partitioner`s. -* Passing `Partitioner`s via `FederatedDataset`'s `partitioner` argument. -* Customization of the dataset splitting before the partitioning. +* ✅ Support for more datasets (especially the ones that have user id present). +* ✅ Creation of custom `Partitioner`s. +* ✅ More out-of-the-box `Partitioner`s. +* ✅ Passing `Partitioner`s via `FederatedDataset`'s `partitioners` argument. +* ✅ Customization of the dataset splitting before the partitioning. * Simplification of the dataset transformation to the popular frameworks/types. * Creation of the synthetic data, * Support for Vertical FL. From eead07f5b721f347ee0bc836e6d8515054b7ce6e Mon Sep 17 00:00:00 2001 From: Adam Narozniak <51029327+adam-narozniak@users.noreply.github.com> Date: Wed, 15 Nov 2023 16:58:10 +0100 Subject: [PATCH 12/64] Bump up the FDS library version (#2606) --- datasets/pyproject.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/datasets/pyproject.toml b/datasets/pyproject.toml index 62eaea3e5d48..7145345f7d94 100644 --- a/datasets/pyproject.toml +++ b/datasets/pyproject.toml @@ -4,7 +4,7 @@ build-backend = "poetry.core.masonry.api" [tool.poetry] name = "flwr-datasets" -version = "0.0.1" +version = "0.0.2" description = "Flower Datasets" license = "Apache-2.0" authors = ["The Flower Authors "] From 760a4564b075fdaed7d3ee8b06607629fe84edf9 Mon Sep 17 00:00:00 2001 From: Heng Pan <134433891+panh99@users.noreply.github.com> Date: Wed, 15 Nov 2023 16:19:11 +0000 Subject: [PATCH 13/64] Fix incorrect docstring for 'certificates' in 'start_driver' API (#2607) --- src/py/flwr/driver/app.py | 23 +++++++++++------------ 1 file changed, 11 insertions(+), 12 deletions(-) diff --git a/src/py/flwr/driver/app.py b/src/py/flwr/driver/app.py index 7c2c576c2c43..3cb8652365d8 100644 --- a/src/py/flwr/driver/app.py +++ b/src/py/flwr/driver/app.py @@ -19,7 +19,8 @@ import threading import time from logging import INFO -from typing import Dict, Optional +from pathlib import Path +from typing import Dict, Optional, Union from flwr.common import EventType, event from flwr.common.address import parse_address @@ -51,7 +52,7 @@ def start_driver( # pylint: disable=too-many-arguments, too-many-locals config: Optional[ServerConfig] = None, strategy: Optional[Strategy] = None, client_manager: Optional[ClientManager] = None, - certificates: Optional[bytes] = None, + root_certificates: Optional[Union[bytes, str]] = None, ) -> History: """Start a Flower 
Driver API server. @@ -76,14 +77,10 @@ def start_driver( # pylint: disable=too-many-arguments, too-many-locals `flwr.driver.driver_client_manager.DriverClientManager`. If no implementation is provided, then `start_driver` will use `flwr.driver.driver_client_manager.DriverClientManager`. - certificates : bytes (default: None) - Tuple containing root certificate, server certificate, and private key - to start a secure SSL-enabled server. The tuple is expected to have - three bytes elements in the following order: - - * CA certificate. - * server certificate. - * server private key. + root_certificates : Optional[Union[bytes, str]] (default: None) + The PEM-encoded root certificates as a byte string or a path string. + If provided, a secure connection using the certificates will be + established to an SSL-enabled Flower server. Returns ------- @@ -99,7 +96,7 @@ def start_driver( # pylint: disable=too-many-arguments, too-many-locals Starting a driver that connects to an SSL-enabled server: >>> start_driver( - >>> certificates=Path("/crts/root.pem").read_bytes() + >>> root_certificates=Path("/crts/root.pem").read_bytes() >>> ) """ event(EventType.START_DRIVER_ENTER) @@ -112,7 +109,9 @@ def start_driver( # pylint: disable=too-many-arguments, too-many-locals address = f"[{host}]:{port}" if is_v6 else f"{host}:{port}" # Create the Driver - driver = GrpcDriver(driver_service_address=address, certificates=certificates) + if isinstance(root_certificates, str): + root_certificates = Path(root_certificates).read_bytes() + driver = GrpcDriver(driver_service_address=address, certificates=root_certificates) driver.connect() lock = threading.Lock() From f0561759aafcbba2283ce308ef1eecad6de8dff2 Mon Sep 17 00:00:00 2001 From: Yan Gao Date: Wed, 15 Nov 2023 18:42:18 +0000 Subject: [PATCH 14/64] Quickstart-xgboost with bagging aggregation (#2554) Co-authored-by: yan-gao-GY --- examples/xgboost-comprehensive/README.md | 87 +++++++++ examples/xgboost-comprehensive/client.py | 174 ++++++++++++++++++ examples/xgboost-comprehensive/dataset.py | 67 +++++++ examples/xgboost-comprehensive/pyproject.toml | 15 ++ .../xgboost-comprehensive/requirements.txt | 3 + examples/xgboost-comprehensive/run.sh | 17 ++ examples/xgboost-comprehensive/server.py | 111 +++++++++++ examples/xgboost-comprehensive/strategy.py | 139 ++++++++++++++ examples/xgboost-comprehensive/utils.py | 72 ++++++++ 9 files changed, 685 insertions(+) create mode 100644 examples/xgboost-comprehensive/README.md create mode 100644 examples/xgboost-comprehensive/client.py create mode 100644 examples/xgboost-comprehensive/dataset.py create mode 100644 examples/xgboost-comprehensive/pyproject.toml create mode 100644 examples/xgboost-comprehensive/requirements.txt create mode 100755 examples/xgboost-comprehensive/run.sh create mode 100644 examples/xgboost-comprehensive/server.py create mode 100644 examples/xgboost-comprehensive/strategy.py create mode 100644 examples/xgboost-comprehensive/utils.py diff --git a/examples/xgboost-comprehensive/README.md b/examples/xgboost-comprehensive/README.md new file mode 100644 index 000000000000..3801d4813a26 --- /dev/null +++ b/examples/xgboost-comprehensive/README.md @@ -0,0 +1,87 @@ +# Flower Example using XGBoost + +This example demonstrates how to perform EXtreme Gradient Boosting (XGBoost) within Flower using `xgboost` package. +Tree-based with bagging method is used for aggregation on the server. + +## Project Setup + +Start by cloning the example project. 
We prepared a single-line command that you can copy into your shell which will checkout the example for you: + +```shell +git clone --depth=1 https://github.com/adap/flower.git && mv flower/examples/quickstart-xgboost . && rm -rf flower && cd quickstart-xgboost +``` + +This will create a new directory called `quickstart-xgboost` containing the following files: + +``` +-- README.md <- Your're reading this right now +-- server.py <- Defines the server-side logic +-- strategy.py <- Defines the tree-based bagging aggregation +-- client.py <- Defines the client-side logic +-- dataset.py <- Defines the functions of data loading and partitioning +-- pyproject.toml <- Example dependencies (if you use Poetry) +-- requirements.txt <- Example dependencies +``` + +### Installing Dependencies + +Project dependencies (such as `xgboost` and `flwr`) are defined in `pyproject.toml` and `requirements.txt`. We recommend [Poetry](https://python-poetry.org/docs/) to install those dependencies and manage your virtual environment ([Poetry installation](https://python-poetry.org/docs/#installation)) or [pip](https://pip.pypa.io/en/latest/development/), but feel free to use a different way of installing dependencies and managing virtual environments if you have other preferences. + +#### Poetry + +```shell +poetry install +poetry shell +``` + +Poetry will install all your dependencies in a newly created virtual environment. To verify that everything works correctly you can run the following command: + +```shell +poetry run python3 -c "import flwr" +``` + +If you don't see any errors you're good to go! + +#### pip + +Write the command below in your terminal to install the dependencies according to the configuration file requirements.txt. + +```shell +pip install -r requirements.txt +``` + +## Run Federated Learning with XGBoost and Flower + +Afterwards you are ready to start the Flower server as well as the clients. +You can simply start the server in a terminal as follows: + +```shell +python3 server.py +``` + +Now you are ready to start the Flower clients which will participate in the learning. +To do so simply open two more terminal windows and run the following commands. + +Start client 1 in the first terminal: + +```shell +python3 client.py --node-id=0 +``` + +Start client 2 in the second terminal: + +```shell +python3 client.py --node-id=1 +``` + +You will see that XGBoost is starting a federated training. + +Alternatively, you can use `run.sh` to run the same experiment in a single terminal as follows: + +```shell +bash run.sh +``` + +Besides, we provide options to customise the experimental settings, including data partitioning and centralised/distributed evaluation (see `utils.py`). +Look at the [code](https://github.com/adap/flower/tree/main/examples/quickstart-xgboost) +and [tutorial](https://flower.dev/docs/framework/tutorial-quickstart-xgboost.html) for a detailed explanation. 
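As a preview of what these files implement, the client-side data pipeline defined in `client.py` and `dataset.py` (shown in the diffs below) boils down to roughly the following sketch. It is a simplified illustration: the resplitter that enlarges the centralised test set is omitted, and the partition count, partition index, test fraction, and seed are placeholders.

```python
from flwr_datasets import FederatedDataset
from flwr_datasets.partitioner import ExponentialPartitioner
import xgboost as xgb

# Assign the HIGGS training split to one of the size-based partitioners.
partitioner = ExponentialPartitioner(num_partitions=20)
fds = FederatedDataset(dataset="jxie/higgs", partitioners={"train": partitioner})

# Each client loads only its own partition and keeps a local validation split.
partition = fds.load_partition(node_id=0, split="train")
partition.set_format("numpy")
train_valid = partition.train_test_split(test_size=0.2, seed=42)

# xgboost consumes DMatrix objects built from the "inputs" and "label" columns.
train_dmatrix = xgb.DMatrix(train_valid["train"]["inputs"], label=train_valid["train"]["label"])
valid_dmatrix = xgb.DMatrix(train_valid["test"]["inputs"], label=train_valid["test"]["label"])
```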
diff --git a/examples/xgboost-comprehensive/client.py b/examples/xgboost-comprehensive/client.py new file mode 100644 index 000000000000..5aba30266b5a --- /dev/null +++ b/examples/xgboost-comprehensive/client.py @@ -0,0 +1,174 @@ +import warnings +from logging import INFO +import xgboost as xgb + +import flwr as fl +from flwr_datasets import FederatedDataset +from flwr.common.logger import log +from flwr.common import ( + Code, + EvaluateIns, + EvaluateRes, + FitIns, + FitRes, + GetParametersIns, + GetParametersRes, + Parameters, + Status, +) + +from dataset import ( + instantiate_partitioner, + train_test_split, + transform_dataset_to_dmatrix, + resplit, +) +from utils import client_args_parser + + +warnings.filterwarnings("ignore", category=UserWarning) + + +# Parse arguments for experimental settings +args = client_args_parser() + +# Load (HIGGS) dataset and conduct partitioning +num_partitions = args.num_partitions + +# Partitioner type is chosen from ["uniform", "linear", "square", "exponential"] +partitioner_type = args.partitioner_type + +# Instantiate partitioner +partitioner = instantiate_partitioner( + partitioner_type=partitioner_type, num_partitions=num_partitions +) +fds = FederatedDataset( + dataset="jxie/higgs", partitioners={"train": partitioner}, resplitter=resplit +) + +# Let's use the first partition as an example +node_id = args.node_id +partition = fds.load_partition(idx=node_id, split="train") +partition.set_format("numpy") + +if args.centralised_eval: + # Use centralised test set for evaluation + train_data = partition + valid_data = fds.load_full("test") + valid_data.set_format("numpy") + num_train = train_data.shape[0] + num_val = valid_data.shape[0] +else: + # Train/test splitting + SEED = args.seed + test_fraction = args.test_fraction + train_data, valid_data, num_train, num_val = train_test_split( + partition, test_fraction=test_fraction, seed=SEED + ) + +# Reformat data to DMatrix for xgboost +train_dmatrix = transform_dataset_to_dmatrix(train_data) +valid_dmatrix = transform_dataset_to_dmatrix(valid_data) + + +# Hyper-parameters for xgboost training +num_local_round = 1 +params = { + "objective": "binary:logistic", + "eta": 0.1, # Learning rate + "max_depth": 8, + "eval_metric": "auc", + "nthread": 16, + "num_parallel_tree": 1, + "subsample": 1, + "tree_method": "hist", +} + + +# Define Flower client +class FlowerClient(fl.client.Client): + def __init__(self): + self.bst = None + self.config = None + + def get_parameters(self, ins: GetParametersIns) -> GetParametersRes: + _ = (self, ins) + return GetParametersRes( + status=Status( + code=Code.OK, + message="OK", + ), + parameters=Parameters(tensor_type="", tensors=[]), + ) + + def _local_boost(self): + # Update trees based on local training data. 
+ for i in range(num_local_round): + self.bst.update(train_dmatrix, self.bst.num_boosted_rounds()) + + # Extract the last N=num_local_round trees for sever aggregation + bst = self.bst[ + self.bst.num_boosted_rounds() + - num_local_round : self.bst.num_boosted_rounds() + ] + + return bst + + def fit(self, ins: FitIns) -> FitRes: + if not self.bst: + # First round local training + log(INFO, "Start training at round 1") + bst = xgb.train( + params, + train_dmatrix, + num_boost_round=num_local_round, + evals=[(valid_dmatrix, "validate"), (train_dmatrix, "train")], + ) + self.config = bst.save_config() + self.bst = bst + else: + for item in ins.parameters.tensors: + global_model = bytearray(item) + + # Load global model into booster + self.bst.load_model(global_model) + self.bst.load_config(self.config) + + bst = self._local_boost() + + local_model = bst.save_raw("json") + local_model_bytes = bytes(local_model) + + return FitRes( + status=Status( + code=Code.OK, + message="OK", + ), + parameters=Parameters(tensor_type="", tensors=[local_model_bytes]), + num_examples=num_train, + metrics={}, + ) + + def evaluate(self, ins: EvaluateIns) -> EvaluateRes: + eval_results = self.bst.eval_set( + evals=[(valid_dmatrix, "valid")], + iteration=self.bst.num_boosted_rounds() - 1, + ) + auc = round(float(eval_results.split("\t")[1].split(":")[1]), 4) + + global_round = ins.config["global_round"] + log(INFO, f"AUC = {auc} at round {global_round}") + + return EvaluateRes( + status=Status( + code=Code.OK, + message="OK", + ), + loss=0.0, + num_examples=num_val, + metrics={"AUC": auc}, + ) + + +# Start Flower client +fl.client.start_client(server_address="127.0.0.1:8080", client=FlowerClient()) diff --git a/examples/xgboost-comprehensive/dataset.py b/examples/xgboost-comprehensive/dataset.py new file mode 100644 index 000000000000..80c978f1077b --- /dev/null +++ b/examples/xgboost-comprehensive/dataset.py @@ -0,0 +1,67 @@ +import xgboost as xgb +from typing import Callable, Dict, List, Optional, Tuple, Union +from datasets import Dataset, DatasetDict, concatenate_datasets +from flwr_datasets.partitioner import ( + IidPartitioner, + LinearPartitioner, + SquarePartitioner, + ExponentialPartitioner, +) + +CORRELATION_TO_PARTITIONER = { + "uniform": IidPartitioner, + "linear": LinearPartitioner, + "square": SquarePartitioner, + "exponential": ExponentialPartitioner, +} + + +def instantiate_partitioner(partitioner_type: str, num_partitions: int): + """Initialise partitioner based on selected partitioner type and number of + partitions.""" + partitioner = CORRELATION_TO_PARTITIONER[partitioner_type]( + num_partitions=num_partitions + ) + return partitioner + + +def train_test_split(partition: Dataset, test_fraction: float, seed: int): + """Split the data into train and validation set given split rate.""" + train_test = partition.train_test_split(test_size=test_fraction, seed=seed) + partition_train = train_test["train"] + partition_test = train_test["test"] + + num_train = len(partition_train) + num_test = len(partition_test) + + return partition_train, partition_test, num_train, num_test + + +def transform_dataset_to_dmatrix(data: Union[Dataset, DatasetDict]) -> xgb.core.DMatrix: + """Transform dataset to DMatrix format for xgboost.""" + x = data["inputs"] + y = data["label"] + new_data = xgb.DMatrix(x, label=y) + return new_data + + +def resplit(dataset: DatasetDict) -> DatasetDict: + """Increase the quantity of centralised test samples from 500K to 1M.""" + return DatasetDict( + { + "train": 
dataset["train"].select( + range(0, dataset["train"].num_rows - 500_000) + ), + "test": concatenate_datasets( + [ + dataset["train"].select( + range( + dataset["train"].num_rows - 500_000, + dataset["train"].num_rows, + ) + ), + dataset["test"], + ] + ), + } + ) diff --git a/examples/xgboost-comprehensive/pyproject.toml b/examples/xgboost-comprehensive/pyproject.toml new file mode 100644 index 000000000000..5414b5122154 --- /dev/null +++ b/examples/xgboost-comprehensive/pyproject.toml @@ -0,0 +1,15 @@ +[build-system] +requires = ["poetry-core>=1.4.0"] +build-backend = "poetry.core.masonry.api" + +[tool.poetry] +name = "xgboost-comprehensive" +version = "0.1.0" +description = "Federated XGBoost with Flower (comprehensive)" +authors = ["The Flower Authors "] + +[tool.poetry.dependencies] +python = ">=3.8,<3.11" +flwr = ">=1.0,<2.0" +flwr-datasets = ">=0.0.2,<1.0.0" +xgboost = ">=2.0.0,<3.0.0" diff --git a/examples/xgboost-comprehensive/requirements.txt b/examples/xgboost-comprehensive/requirements.txt new file mode 100644 index 000000000000..c6b9c1a67894 --- /dev/null +++ b/examples/xgboost-comprehensive/requirements.txt @@ -0,0 +1,3 @@ +flwr>=1.0, <2.0 +flwr-datasets>=0.0.2, <1.0.0 +xgboost>=2.0.0, <3.0.0 diff --git a/examples/xgboost-comprehensive/run.sh b/examples/xgboost-comprehensive/run.sh new file mode 100755 index 000000000000..7cf65fa4d52d --- /dev/null +++ b/examples/xgboost-comprehensive/run.sh @@ -0,0 +1,17 @@ +#!/bin/bash +set -e +cd "$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null 2>&1 && pwd )"/ + +echo "Starting server" +python server.py & +sleep 15 # Sleep for 15s to give the server enough time to start + +for i in `seq 0 1`; do + echo "Starting client $i" + python3 client.py --node-id=$i & +done + +# Enable CTRL+C to stop all background processes +trap "trap - SIGTERM && kill -- -$$" SIGINT SIGTERM +# Wait for all background processes to complete +wait diff --git a/examples/xgboost-comprehensive/server.py b/examples/xgboost-comprehensive/server.py new file mode 100644 index 000000000000..857e99528013 --- /dev/null +++ b/examples/xgboost-comprehensive/server.py @@ -0,0 +1,111 @@ +from typing import Dict +from logging import INFO +import xgboost as xgb + +import flwr as fl +from flwr.common.logger import log +from flwr.common import Parameters, Scalar +from flwr_datasets import FederatedDataset + +from strategy import FedXgbBagging +from utils import server_args_parser +from dataset import resplit, transform_dataset_to_dmatrix + + +# Parse arguments for experimental settings +args = server_args_parser() +pool_size = args.pool_size +num_rounds = args.num_rounds +num_clients_per_round = args.num_clients_per_round +num_evaluate_clients = args.num_evaluate_clients +centralised_eval = args.centralised_eval + +# Load centralised test set +if centralised_eval: + fds = FederatedDataset( + dataset="jxie/higgs", partitioners={"train": 20}, resplitter=resplit + ) + test_set = fds.load_full("test") + test_set.set_format("numpy") + test_dmatrix = transform_dataset_to_dmatrix(test_set) + +# Hyper-parameters used for initialisation +params = { + "objective": "binary:logistic", + "eta": 0.1, # Learning rate + "max_depth": 8, + "eval_metric": "auc", + "nthread": 16, + "num_parallel_tree": 1, + "subsample": 1, + "tree_method": "hist", +} + + +def eval_config(rnd: int) -> Dict[str, str]: + """Return a configuration with global epochs.""" + config = { + "global_round": str(rnd), + } + return config + + +def evaluate_metrics_aggregation(eval_metrics): + """Return an aggregated metric (AUC) 
for evaluation.""" + total_num = sum([num for num, _ in eval_metrics]) + auc_aggregated = ( + sum([metrics["AUC"] * num for num, metrics in eval_metrics]) / total_num + ) + metrics_aggregated = {"AUC": auc_aggregated} + return metrics_aggregated + + +def get_evaluate_fn(test_data): + """Return a function for centralised evaluation.""" + + def evaluate_fn( + server_round: int, parameters: Parameters, config: Dict[str, Scalar] + ): + # If at the first round, skip the evaluation + if server_round == 0: + return 0, {} + else: + bst = xgb.Booster(params=params) + for para in parameters.tensors: + para_b = bytearray(para) + + # Load global model + bst.load_model(para_b) + # Run evaluation + eval_results = bst.eval_set( + evals=[(test_data, "valid")], + iteration=bst.num_boosted_rounds() - 1, + ) + auc = round(float(eval_results.split("\t")[1].split(":")[1]), 4) + log(INFO, f"AUC = {auc} at round {server_round}") + + return 0, {"AUC": auc} + + return evaluate_fn + + +# Define strategy +strategy = FedXgbBagging( + evaluate_function=get_evaluate_fn(test_dmatrix) if centralised_eval else None, + fraction_fit=(float(num_clients_per_round) / pool_size), + min_fit_clients=num_clients_per_round, + min_available_clients=pool_size, + min_evaluate_clients=num_evaluate_clients if not centralised_eval else 0, + fraction_evaluate=1.0 if not centralised_eval else 0.0, + on_evaluate_config_fn=eval_config, + evaluate_metrics_aggregation_fn=evaluate_metrics_aggregation + if not centralised_eval + else None, +) + +# Start Flower server +fl.server.start_server( + server_address="0.0.0.0:8080", + config=fl.server.ServerConfig(num_rounds=num_rounds), + strategy=strategy, +) diff --git a/examples/xgboost-comprehensive/strategy.py b/examples/xgboost-comprehensive/strategy.py new file mode 100644 index 000000000000..814010720a77 --- /dev/null +++ b/examples/xgboost-comprehensive/strategy.py @@ -0,0 +1,139 @@ +from logging import WARNING +from typing import Callable, Dict, List, Optional, Tuple, Union +import flwr as fl +import json + +from flwr.common import ( + EvaluateRes, + FitRes, + Parameters, + Scalar, +) +from flwr.server.client_proxy import ClientProxy +from flwr.common.logger import log + + +class FedXgbBagging(fl.server.strategy.FedAvg): + def __init__( + self, + evaluate_function: Optional[ + Callable[ + [int, Parameters, Dict[str, Scalar]], + Optional[Tuple[float, Dict[str, Scalar]]], + ] + ] = None, + **kwargs, + ): + self.evaluate_function = evaluate_function + self.global_model = None + super().__init__(**kwargs) + + def aggregate_fit( + self, + server_round: int, + results: List[Tuple[ClientProxy, FitRes]], + failures: List[Union[Tuple[ClientProxy, FitRes], BaseException]], + ) -> Tuple[Optional[Parameters], Dict[str, Scalar]]: + """Aggregate fit results using bagging.""" + if not results: + return None, {} + # Do not aggregate if there are failures and failures are not accepted + if not self.accept_failures and failures: + return None, {} + + # Aggregate all the client trees + for _, fit_res in results: + update = fit_res.parameters.tensors + for item in update: + self.global_model = aggregate( + self.global_model, json.loads(bytearray(item)) + ) + + weights_avg = json.dumps(self.global_model) + + return ( + Parameters(tensor_type="", tensors=[bytes(weights_avg, "utf-8")]), + {}, + ) + + def aggregate_evaluate( + self, + server_round: int, + results: List[Tuple[ClientProxy, EvaluateRes]], + failures: List[Union[Tuple[ClientProxy, EvaluateRes], BaseException]], + ) -> Tuple[Optional[float], Dict[str, 
Scalar]]: + """Aggregate evaluation metrics using average.""" + if not results: + return None, {} + # Do not aggregate if there are failures and failures are not accepted + if not self.accept_failures and failures: + return None, {} + + # Aggregate custom metrics if aggregation fn was provided + metrics_aggregated = {} + if self.evaluate_metrics_aggregation_fn: + eval_metrics = [(res.num_examples, res.metrics) for _, res in results] + metrics_aggregated = self.evaluate_metrics_aggregation_fn(eval_metrics) + elif server_round == 1: # Only log this warning once + log(WARNING, "No evaluate_metrics_aggregation_fn provided") + + return 0, metrics_aggregated + + def evaluate( + self, server_round: int, parameters: Parameters + ) -> Optional[Tuple[float, Dict[str, Scalar]]]: + """Evaluate model parameters using an evaluation function.""" + if self.evaluate_function is None: + # No evaluation function provided + return None + eval_res = self.evaluate_function(server_round, parameters, {}) + if eval_res is None: + return None + loss, metrics = eval_res + return loss, metrics + + +def aggregate(bst_prev: Optional[Dict], bst_curr: Dict) -> Dict: + """Conduct bagging aggregation for given trees.""" + if not bst_prev: + return bst_curr + else: + # Get the tree numbers + tree_num_prev, paral_tree_num_prev = _get_tree_nums(bst_prev) + tree_num_curr, paral_tree_num_curr = _get_tree_nums(bst_curr) + + bst_prev["learner"]["gradient_booster"]["model"]["gbtree_model_param"][ + "num_trees" + ] = str(tree_num_prev + paral_tree_num_curr) + iteration_indptr = bst_prev["learner"]["gradient_booster"]["model"][ + "iteration_indptr" + ] + bst_prev["learner"]["gradient_booster"]["model"]["iteration_indptr"].append( + iteration_indptr[-1] + paral_tree_num_curr + ) + + # Aggregate new trees + trees_curr = bst_curr["learner"]["gradient_booster"]["model"]["trees"] + for tree_count in range(paral_tree_num_curr): + trees_curr[tree_count]["id"] = tree_num_prev + tree_count + bst_prev["learner"]["gradient_booster"]["model"]["trees"].append( + trees_curr[tree_count] + ) + bst_prev["learner"]["gradient_booster"]["model"]["tree_info"].append(0) + return bst_prev + + +def _get_tree_nums(xgb_model: Dict) -> (int, int): + # Get the number of trees + tree_num = int( + xgb_model["learner"]["gradient_booster"]["model"]["gbtree_model_param"][ + "num_trees" + ] + ) + # Get the number of parallel trees + paral_tree_num = int( + xgb_model["learner"]["gradient_booster"]["model"]["gbtree_model_param"][ + "num_parallel_tree" + ] + ) + return tree_num, paral_tree_num diff --git a/examples/xgboost-comprehensive/utils.py b/examples/xgboost-comprehensive/utils.py new file mode 100644 index 000000000000..51c1a1b9604d --- /dev/null +++ b/examples/xgboost-comprehensive/utils.py @@ -0,0 +1,72 @@ +import argparse + + +def client_args_parser(): + """Parse arguments to define experimental settings on client side.""" + parser = argparse.ArgumentParser() + + parser.add_argument( + "--num-partitions", default=10, type=int, help="Number of partitions." + ) + parser.add_argument( + "--partitioner-type", + default="uniform", + type=str, + choices=["uniform", "linear", "square", "exponential"], + help="Partitioner types.", + ) + parser.add_argument( + "--node-id", + default=0, + type=int, + help="Node ID used for the current client.", + ) + parser.add_argument( + "--seed", default=42, type=int, help="Seed used for train/test splitting." 
+ ) + parser.add_argument( + "--test-fraction", + default=0.2, + type=float, + help="Test fraction for train/test splitting.", + ) + parser.add_argument( + "--centralised-eval", + action="store_true", + help="Conduct centralised evaluation (True), or client evaluation on hold-out data (False).", + ) + + args = parser.parse_args() + return args + + +def server_args_parser(): + """Parse arguments to define experimental settings on server side.""" + parser = argparse.ArgumentParser() + + parser.add_argument( + "--pool-size", default=2, type=int, help="Number of total clients." + ) + parser.add_argument( + "--num-rounds", default=5, type=int, help="Number of FL rounds." + ) + parser.add_argument( + "--num-clients-per-round", + default=2, + type=int, + help="Number of clients participate in training each round.", + ) + parser.add_argument( + "--num-evaluate-clients", + default=2, + type=int, + help="Number of clients selected for evaluation.", + ) + parser.add_argument( + "--centralised-eval", + action="store_true", + help="Conduct centralised evaluation (True), or client evaluation on hold-out data (False).", + ) + + args = parser.parse_args() + return args From ea8ac9e982648f90a28edd762c744ab395112374 Mon Sep 17 00:00:00 2001 From: Daniel Nata Nugraha Date: Thu, 16 Nov 2023 09:34:58 +0100 Subject: [PATCH 15/64] Remove existing dockerfiles (#2600) Co-authored-by: Taner Topal --- src/docker/README.md | 6 ------ src/docker/build.sh | 26 -------------------------- src/docker/default.Dockerfile | 8 -------- src/docker/ssh_key | 27 --------------------------- src/docker/ssh_key.pub | 1 - src/docker/sshd.Dockerfile | 32 -------------------------------- 6 files changed, 100 deletions(-) delete mode 100644 src/docker/README.md delete mode 100755 src/docker/build.sh delete mode 100644 src/docker/default.Dockerfile delete mode 100644 src/docker/ssh_key delete mode 100644 src/docker/ssh_key.pub delete mode 100644 src/docker/sshd.Dockerfile diff --git a/src/docker/README.md b/src/docker/README.md deleted file mode 100644 index 0cebf9a6d737..000000000000 --- a/src/docker/README.md +++ /dev/null @@ -1,6 +0,0 @@ -# Docker - -We provide two docker images. They can be built using the build.sh script in the ./src/docker directory. -The default image will contain the installed flower library with all dependencies to enable easily -trying out flower. The second image is used in tests as well as local runs of baselines using the -FlowerOps package. diff --git a/src/docker/build.sh b/src/docker/build.sh deleted file mode 100755 index ea6b643c1fb2..000000000000 --- a/src/docker/build.sh +++ /dev/null @@ -1,26 +0,0 @@ -#!/bin/bash - -# Copyright 2020 Flower Labs GmbH. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-# ============================================================================== - -set -e -cd "$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null 2>&1 && pwd )"/../../ - -HASH=$(printf "$(git rev-parse HEAD)\n$(git diff | sha1sum)" | sha1sum | cut -c1-7) - -rm -rf dist -python -m poetry build -docker build -f src/docker/default.Dockerfile -t flower:latest -t flower:$HASH . -docker build -f src/docker/sshd.Dockerfile --build-arg SSH_PUBLIC_KEY="$(cat docker/ssh_key.pub)" -t flower-sshd:latest -t flower-sshd:$HASH . diff --git a/src/docker/default.Dockerfile b/src/docker/default.Dockerfile deleted file mode 100644 index 0ebd803fbd9c..000000000000 --- a/src/docker/default.Dockerfile +++ /dev/null @@ -1,8 +0,0 @@ -FROM python:3.7.15-slim-buster - -# Install the biggest dependencies before copying the wheel -RUN pip install tensorflow-cpu==2.6.2 numpy==1.19.5 - -COPY dist/flwr-1.6.0-py3-none-any.whl flwr-1.6.0-py3-none-any.whl -RUN python3.7 -m pip install --no-cache-dir 'flwr-1.6.0-py3-none-any.whl[examples-pytorch,examples-tensorflow,http-logger,baseline,ops]' -RUN rm flwr-1.6.0-py3-none-any.whl diff --git a/src/docker/ssh_key b/src/docker/ssh_key deleted file mode 100644 index c6fd85ac4bf5..000000000000 --- a/src/docker/ssh_key +++ /dev/null @@ -1,27 +0,0 @@ ------BEGIN RSA PRIVATE KEY----- -MIIEpAIBAAKCAQEAvp7rW7/QCAlXeM/U1QESOKD6aBJkLYLyjzD0A/E+rDz5FBV0 -O9gVhIzIwHYznEV9uWbwJfh5GhE5pb2Om2keERRy56/xoxc2LhMvt4fKf+CKoKYU -fl7QYOHmmT6j4vcZX4Ao0ntHBgX6wk2z2F3Ma1lKmRgh8jHPFVRNHsTLV0WArfWS -zegHsGeRN5aLwWjYZ3RIVzCNXsvDozyf88VUn75E2kwBhtg5OH0y5Rvkvm7oqzxt -u4sNLEdcpIhdTSx8lOGqqaBBP4JcFIA1u8dDqSb3qnhOwW6D2rGloKKMkWCYI0RC -IyXOTFWca/rQKBP9gkd4FSxTMNByF5DSpWm4awIDAQABAoIBAGxK/IrzPcxTAk6w -Q8l/AksJqTjWufLhIcwXk4lp1KElQpwWVCJrcE4NbT9p54gwMmzR/jyUHLiicI2o -UwwXe+1YIK9fVXfCCBXE8Kbjjjd36Um7EIl6dAeWTttgqtqWZuPKe2aYgpD5dKxO -WI3jO+8Am1C+VQqzIWKMsgN46cx5xMmJs5WzhqKz//xO2Wrrs2MyPw0QZsHpm1j7 -RYLoPwl6EHN6Y1jWv55s0r3VIAGp4qjuR5BJaQu2ITeWU//TE1ouejMCEO8eLbIH -aI1Ydod9BGxo0ET31GLuV+KM8ilk7+lfP8A2z4MryUEtBgZ+bukxZ6RTOtbXIsuR -ccMAFoECgYEA4GJONbjrrBpEnn0wMKBRFYPpZ1chDC9X0qC0yUfatuiB4PzSsGEQ -phJzCEX+fRG8giFTtWBzavBFdvto5JmYHnAoZ08oZqfI75ID5FmxmBHDbhKisUiF -BCSo2MhWcQu/T7R26WaBDjWqal/xNEdIr5Gw3TP9reIU15xwLfe7eP8CgYEA2Xq/ -6pAKkxmjiOTDRwosyQ6g+d4uLAT8RClt/vLj5FUKtL3TXuCDFrA9iNROmHlAuj05 -PwV+3yV13y+BvkweaX5fIeY10axpr7nlozM2UUVnotTM8mrigPUNkP3jc+3DbtbF -JJgvJgB6e/8fc7L/hf5yoALFeqnPVuoZ82rHtJUCgYEAqTobCDNeMt9gzzNVa+cN -IC/rkU/KEdmG38jqcD+r+XTjt16WPAYIu2eAsEWxyHve8znVWx7SDV4G1dSI6Jfa -fWJTZo7xxnJWfjBelRL8eNc00EhnwVMTVuubGFwHrZyRfbh6Uk9HmALrcuzHComU -jS1Q+ZOjiEfRNke650BGeNsCgYBvKfJNWIE/PZy8zYoPluDbiJG6Btp8/7Net/qU -ldztCpclWd5gIcx/3Wgmt7bQ8zGlwMOxNgLq1wAQK/1aoxfsfn7sGfsqiMGg5BnA -ON4r3pkhvzpfM0lB94COu2f3CvKZhkhQ80UuXJI/LWILRtSbzjsTFTeSApiL6a+I -H+2inQKBgQC0oTcFDUkuS4FRZb/SDfWIDm6RUVuRaSNJuX4aGlGAWFvtQxG+7Avz -VroKqjMvtI5h8VutE8qELRQOdC0TfaUjjhWZ7oX4TxSVqzAtHnXt1PmvS0ih19cY -ckpB+gFPXpgR1GwBi8GEcsWX2aNrYhCMvqD38bI2W8f23xciaSm4wg== ------END RSA PRIVATE KEY----- diff --git a/src/docker/ssh_key.pub b/src/docker/ssh_key.pub deleted file mode 100644 index 302b15494f93..000000000000 --- a/src/docker/ssh_key.pub +++ /dev/null @@ -1 +0,0 @@ -ssh-rsa AAAAB3NzaC1yc2EAAAADAQABAAABAQC+nutbv9AICVd4z9TVARI4oPpoEmQtgvKPMPQD8T6sPPkUFXQ72BWEjMjAdjOcRX25ZvAl+HkaETmlvY6baR4RFHLnr/GjFzYuEy+3h8p/4IqgphR+XtBg4eaZPqPi9xlfgCjSe0cGBfrCTbPYXcxrWUqZGCHyMc8VVE0exMtXRYCt9ZLN6AewZ5E3lovBaNhndEhXMI1ey8OjPJ/zxVSfvkTaTAGG2Dk4fTLlG+S+buirPG27iw0sR1ykiF1NLHyU4aqpoEE/glwUgDW7x0OpJveqeE7BboPasaWgooyRYJgjREIjJc5MVZxr+tAoE/2CR3gVLFMw0HIXkNKlabhr tanto@R2U2 
diff --git a/src/docker/sshd.Dockerfile b/src/docker/sshd.Dockerfile deleted file mode 100644 index fc310dc5e1ce..000000000000 --- a/src/docker/sshd.Dockerfile +++ /dev/null @@ -1,32 +0,0 @@ -FROM python:3.7.15-slim-buster - -RUN apt-get update -RUN apt-get install -y openssh-server screen -RUN mkdir /var/run/sshd - -RUN echo 'root:root' | chpasswd - -RUN sed -ri 's/^#?PermitRootLogin\s+.*/PermitRootLogin yes/' /etc/ssh/sshd_config -RUN sed -ri 's/UsePAM yes/#UsePAM yes/g' /etc/ssh/sshd_config - -RUN mkdir /root/.ssh - -ARG SSH_PUBLIC_KEY -RUN echo $SSH_PUBLIC_KEY > /root/.ssh/authorized_keys - -RUN apt-get clean && \ - rm -rf /var/lib/apt/lists/* /tmp/* /var/tmp/* - -WORKDIR /root - -RUN python3.7 -m pip install tensorflow-cpu==2.6.0 torch==1.7.1 torchvision==0.8.2 numpy==1.19.5 -COPY dist/flwr-1.6.0-py3-none-any.whl flwr-1.6.0-py3-none-any.whl -RUN python3.7 -m pip install --no-cache-dir 'flwr-1.6.0-py3-none-any.whl[examples-pytorch,examples-tensorflow,http-logger,baseline,ops]' && \ - rm flwr-1.6.0-py3-none-any.whl - -RUN python3.7 -m flwr_experimental.baseline.tf_fashion_mnist.download -RUN python3.7 -m flwr_experimental.baseline.tf_cifar.download - -EXPOSE 22 - -CMD ["/usr/sbin/sshd", "-D"] From 7ae14a28c1b7e6751da45aa4a194f8e30ccad6f1 Mon Sep 17 00:00:00 2001 From: Daniel Nata Nugraha Date: Thu, 16 Nov 2023 09:47:23 +0100 Subject: [PATCH 16/64] Add server dockerfile (#2602) Co-authored-by: Taner Topal --- src/docker/server/Dockerfile | 49 ++++++++++++++++++++++++++++++++++++ 1 file changed, 49 insertions(+) create mode 100644 src/docker/server/Dockerfile diff --git a/src/docker/server/Dockerfile b/src/docker/server/Dockerfile new file mode 100644 index 000000000000..d294013faefa --- /dev/null +++ b/src/docker/server/Dockerfile @@ -0,0 +1,49 @@ +# Copyright 2023 Flower Labs GmbH. All Rights Reserved. 
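+#
+# This is a multi-stage build: the `base` stage installs a pyenv-built Python
+# plus pinned pip/setuptools/poetry, the `server` stage installs flwr[rest]
+# and starts the Flower server, and the `test` stage only checks the import.
+# A build command could look roughly like this (the version values are purely
+# illustrative, pick the ones you need):
+#   docker build -f src/docker/server/Dockerfile --target server \
+#     --build-arg PYTHON_VERSION=3.11.6 --build-arg PIP_VERSION=23.3.1 \
+#     --build-arg SETUPTOOLS_VERSION=68.2.2 --build-arg POETRY_VERSION=1.5.1 \
+#     --build-arg FLWR_VERSION=1.5.0 -t flwr-server .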
+ +FROM ubuntu:22.04 as base + +ENV DEBIAN_FRONTEND noninteractive + +# Install system dependencies +RUN apt-get update \ + && apt-get -y --no-install-recommends install \ + clang-format git unzip ca-certificates openssh-client liblzma-dev \ + build-essential libssl-dev zlib1g-dev libbz2-dev libreadline-dev wget\ + libsqlite3-dev curl llvm libncursesw5-dev xz-utils tk-dev libxml2-dev \ + libxmlsec1-dev libffi-dev liblzma-dev \ + && rm -rf /var/lib/apt/lists/* + +# Install PyEnv and Python +ARG PYTHON_VERSION +ENV PYENV_ROOT /root/.pyenv +ENV PATH $PYENV_ROOT/shims:$PYENV_ROOT/bin:$PATH +RUN curl -L https://github.com/pyenv/pyenv-installer/raw/master/bin/pyenv-installer | bash +RUN pyenv install ${PYTHON_VERSION} \ + && pyenv global ${PYTHON_VERSION} \ + && pyenv rehash + +# Install specific version of pip +ARG PIP_VERSION +RUN python -m pip install pip==$PIP_VERSION + +# Install specific version of setuptools +ARG SETUPTOOLS_VERSION +RUN python -m pip install setuptools==$SETUPTOOLS_VERSION + +# Install poetry as all examples use it and therefore it should be available for custom images +ARG POETRY_VERSION +RUN curl -sSL https://install.python-poetry.org | python3 - --version ${POETRY_VERSION} +ENV PATH /root/.local/bin:$PATH +RUN poetry config virtualenvs.create false + +# Server image +FROM base as server + +WORKDIR /app +ARG FLWR_VERSION +RUN python -m pip install -U flwr[rest]==${FLWR_VERSION} +ENTRYPOINT ["python", "-c", "from flwr.server import run_server\nrun_server()"] + +# Test if Flower can be successfully installed and imported +FROM server as test +RUN python -c "from flwr.server import run_server" From 9ad0639b8ea7e7968bc02625539dcbe40db242d0 Mon Sep 17 00:00:00 2001 From: Adam Narozniak <51029327+adam-narozniak@users.noreply.github.com> Date: Thu, 16 Nov 2023 13:55:25 +0100 Subject: [PATCH 17/64] FDS small docs fix (#2610) --- datasets/doc/source/conf.py | 2 +- datasets/flwr_datasets/federated_dataset.py | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/datasets/doc/source/conf.py b/datasets/doc/source/conf.py index dad5f9f036a1..4fccaf0ef084 100644 --- a/datasets/doc/source/conf.py +++ b/datasets/doc/source/conf.py @@ -37,7 +37,7 @@ author = "The Flower Authors" # The full version, including alpha/beta/rc tags -release = "0.0.1" +release = "0.0.2" # -- General configuration --------------------------------------------------- diff --git a/datasets/flwr_datasets/federated_dataset.py b/datasets/flwr_datasets/federated_dataset.py index d102df2c9ae6..c40f8cc34857 100644 --- a/datasets/flwr_datasets/federated_dataset.py +++ b/datasets/flwr_datasets/federated_dataset.py @@ -49,8 +49,8 @@ class FederatedDataset: partitioners : Dict[str, Union[Partitioner, int]] A dictionary mapping the Dataset split (a `str`) to a `Partitioner` or an `int` (representing the number of IID partitions that this split should be partitioned - into). One or multiple `Partitioner`s can be specified in that manner, but at - most, one per split. + into). One or multiple `Partitioner` objects can be specified in that manner, + but at most, one per split. shuffle : bool Whether to randomize the order of samples. Applied prior to resplitting, speratelly to each of the present splits in the dataset. 
It uses the `seed` From f84f0781839c2ed8ce504f20c3eb00e6690b8953 Mon Sep 17 00:00:00 2001 From: Javier Date: Thu, 16 Nov 2023 13:24:25 +0000 Subject: [PATCH 18/64] Update FedBN baseline to new format (#2608) --- baselines/fedbn/.gitignore | 2 + baselines/fedbn/LICENSE | 202 ++++++++++++++ baselines/fedbn/README.md | 159 +++++++++++ baselines/fedbn/_static/train_loss.png | Bin 0 -> 52231 bytes baselines/fedbn/docs/multirun_plot.ipynb | 220 +++++++++++++++ baselines/fedbn/fedbn/__init__.py | 1 + baselines/fedbn/fedbn/client.py | 189 +++++++++++++ baselines/fedbn/fedbn/conf/base.yaml | 41 +++ baselines/fedbn/fedbn/conf/client/fedavg.yaml | 4 + baselines/fedbn/fedbn/conf/client/fedbn.yaml | 4 + baselines/fedbn/fedbn/dataset.py | 263 ++++++++++++++++++ baselines/fedbn/fedbn/dataset_preparation.py | 8 + baselines/fedbn/fedbn/main.py | 86 ++++++ baselines/fedbn/fedbn/models.py | 114 ++++++++ baselines/fedbn/fedbn/server.py | 5 + baselines/fedbn/fedbn/strategy.py | 85 ++++++ baselines/fedbn/fedbn/utils.py | 76 +++++ baselines/fedbn/pyproject.toml | 143 ++++++++++ doc/source/ref-changelog.md | 2 + 19 files changed, 1604 insertions(+) create mode 100644 baselines/fedbn/.gitignore create mode 100644 baselines/fedbn/LICENSE create mode 100644 baselines/fedbn/README.md create mode 100644 baselines/fedbn/_static/train_loss.png create mode 100644 baselines/fedbn/docs/multirun_plot.ipynb create mode 100644 baselines/fedbn/fedbn/__init__.py create mode 100644 baselines/fedbn/fedbn/client.py create mode 100644 baselines/fedbn/fedbn/conf/base.yaml create mode 100644 baselines/fedbn/fedbn/conf/client/fedavg.yaml create mode 100644 baselines/fedbn/fedbn/conf/client/fedbn.yaml create mode 100644 baselines/fedbn/fedbn/dataset.py create mode 100644 baselines/fedbn/fedbn/dataset_preparation.py create mode 100644 baselines/fedbn/fedbn/main.py create mode 100644 baselines/fedbn/fedbn/models.py create mode 100644 baselines/fedbn/fedbn/server.py create mode 100644 baselines/fedbn/fedbn/strategy.py create mode 100644 baselines/fedbn/fedbn/utils.py create mode 100644 baselines/fedbn/pyproject.toml diff --git a/baselines/fedbn/.gitignore b/baselines/fedbn/.gitignore new file mode 100644 index 000000000000..de1e160448e5 --- /dev/null +++ b/baselines/fedbn/.gitignore @@ -0,0 +1,2 @@ +outputs/ +multirun/ diff --git a/baselines/fedbn/LICENSE b/baselines/fedbn/LICENSE new file mode 100644 index 000000000000..d64569567334 --- /dev/null +++ b/baselines/fedbn/LICENSE @@ -0,0 +1,202 @@ + + Apache License + Version 2.0, January 2004 + http://www.apache.org/licenses/ + + TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION + + 1. Definitions. + + "License" shall mean the terms and conditions for use, reproduction, + and distribution as defined by Sections 1 through 9 of this document. + + "Licensor" shall mean the copyright owner or entity authorized by + the copyright owner that is granting the License. + + "Legal Entity" shall mean the union of the acting entity and all + other entities that control, are controlled by, or are under common + control with that entity. For the purposes of this definition, + "control" means (i) the power, direct or indirect, to cause the + direction or management of such entity, whether by contract or + otherwise, or (ii) ownership of fifty percent (50%) or more of the + outstanding shares, or (iii) beneficial ownership of such entity. + + "You" (or "Your") shall mean an individual or Legal Entity + exercising permissions granted by this License. 
+ + "Source" form shall mean the preferred form for making modifications, + including but not limited to software source code, documentation + source, and configuration files. + + "Object" form shall mean any form resulting from mechanical + transformation or translation of a Source form, including but + not limited to compiled object code, generated documentation, + and conversions to other media types. + + "Work" shall mean the work of authorship, whether in Source or + Object form, made available under the License, as indicated by a + copyright notice that is included in or attached to the work + (an example is provided in the Appendix below). + + "Derivative Works" shall mean any work, whether in Source or Object + form, that is based on (or derived from) the Work and for which the + editorial revisions, annotations, elaborations, or other modifications + represent, as a whole, an original work of authorship. For the purposes + of this License, Derivative Works shall not include works that remain + separable from, or merely link (or bind by name) to the interfaces of, + the Work and Derivative Works thereof. + + "Contribution" shall mean any work of authorship, including + the original version of the Work and any modifications or additions + to that Work or Derivative Works thereof, that is intentionally + submitted to Licensor for inclusion in the Work by the copyright owner + or by an individual or Legal Entity authorized to submit on behalf of + the copyright owner. For the purposes of this definition, "submitted" + means any form of electronic, verbal, or written communication sent + to the Licensor or its representatives, including but not limited to + communication on electronic mailing lists, source code control systems, + and issue tracking systems that are managed by, or on behalf of, the + Licensor for the purpose of discussing and improving the Work, but + excluding communication that is conspicuously marked or otherwise + designated in writing by the copyright owner as "Not a Contribution." + + "Contributor" shall mean Licensor and any individual or Legal Entity + on behalf of whom a Contribution has been received by Licensor and + subsequently incorporated within the Work. + + 2. Grant of Copyright License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + copyright license to reproduce, prepare Derivative Works of, + publicly display, publicly perform, sublicense, and distribute the + Work and such Derivative Works in Source or Object form. + + 3. Grant of Patent License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + (except as stated in this section) patent license to make, have made, + use, offer to sell, sell, import, and otherwise transfer the Work, + where such license applies only to those patent claims licensable + by such Contributor that are necessarily infringed by their + Contribution(s) alone or by combination of their Contribution(s) + with the Work to which such Contribution(s) was submitted. 
If You + institute patent litigation against any entity (including a + cross-claim or counterclaim in a lawsuit) alleging that the Work + or a Contribution incorporated within the Work constitutes direct + or contributory patent infringement, then any patent licenses + granted to You under this License for that Work shall terminate + as of the date such litigation is filed. + + 4. Redistribution. You may reproduce and distribute copies of the + Work or Derivative Works thereof in any medium, with or without + modifications, and in Source or Object form, provided that You + meet the following conditions: + + (a) You must give any other recipients of the Work or + Derivative Works a copy of this License; and + + (b) You must cause any modified files to carry prominent notices + stating that You changed the files; and + + (c) You must retain, in the Source form of any Derivative Works + that You distribute, all copyright, patent, trademark, and + attribution notices from the Source form of the Work, + excluding those notices that do not pertain to any part of + the Derivative Works; and + + (d) If the Work includes a "NOTICE" text file as part of its + distribution, then any Derivative Works that You distribute must + include a readable copy of the attribution notices contained + within such NOTICE file, excluding those notices that do not + pertain to any part of the Derivative Works, in at least one + of the following places: within a NOTICE text file distributed + as part of the Derivative Works; within the Source form or + documentation, if provided along with the Derivative Works; or, + within a display generated by the Derivative Works, if and + wherever such third-party notices normally appear. The contents + of the NOTICE file are for informational purposes only and + do not modify the License. You may add Your own attribution + notices within Derivative Works that You distribute, alongside + or as an addendum to the NOTICE text from the Work, provided + that such additional attribution notices cannot be construed + as modifying the License. + + You may add Your own copyright statement to Your modifications and + may provide additional or different license terms and conditions + for use, reproduction, or distribution of Your modifications, or + for any such Derivative Works as a whole, provided Your use, + reproduction, and distribution of the Work otherwise complies with + the conditions stated in this License. + + 5. Submission of Contributions. Unless You explicitly state otherwise, + any Contribution intentionally submitted for inclusion in the Work + by You to the Licensor shall be under the terms and conditions of + this License, without any additional terms or conditions. + Notwithstanding the above, nothing herein shall supersede or modify + the terms of any separate license agreement you may have executed + with Licensor regarding such Contributions. + + 6. Trademarks. This License does not grant permission to use the trade + names, trademarks, service marks, or product names of the Licensor, + except as required for reasonable and customary use in describing the + origin of the Work and reproducing the content of the NOTICE file. + + 7. Disclaimer of Warranty. 
Unless required by applicable law or + agreed to in writing, Licensor provides the Work (and each + Contributor provides its Contributions) on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or + implied, including, without limitation, any warranties or conditions + of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A + PARTICULAR PURPOSE. You are solely responsible for determining the + appropriateness of using or redistributing the Work and assume any + risks associated with Your exercise of permissions under this License. + + 8. Limitation of Liability. In no event and under no legal theory, + whether in tort (including negligence), contract, or otherwise, + unless required by applicable law (such as deliberate and grossly + negligent acts) or agreed to in writing, shall any Contributor be + liable to You for damages, including any direct, indirect, special, + incidental, or consequential damages of any character arising as a + result of this License or out of the use or inability to use the + Work (including but not limited to damages for loss of goodwill, + work stoppage, computer failure or malfunction, or any and all + other commercial damages or losses), even if such Contributor + has been advised of the possibility of such damages. + + 9. Accepting Warranty or Additional Liability. While redistributing + the Work or Derivative Works thereof, You may choose to offer, + and charge a fee for, acceptance of support, warranty, indemnity, + or other liability obligations and/or rights consistent with this + License. However, in accepting such obligations, You may act only + on Your own behalf and on Your sole responsibility, not on behalf + of any other Contributor, and only if You agree to indemnify, + defend, and hold each Contributor harmless for any liability + incurred by, or claims asserted against, such Contributor by reason + of your accepting any such warranty or additional liability. + + END OF TERMS AND CONDITIONS + + APPENDIX: How to apply the Apache License to your work. + + To apply the Apache License to your work, attach the following + boilerplate notice, with the fields enclosed by brackets "[]" + replaced with your own identifying information. (Don't include + the brackets!) The text should be enclosed in the appropriate + comment syntax for the file format. We also recommend that a + file or class name and description of purpose be included on the + same "printed page" as the copyright notice for easier + identification within third-party archives. + + Copyright [yyyy] [name of copyright owner] + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
diff --git a/baselines/fedbn/README.md b/baselines/fedbn/README.md
new file mode 100644
index 000000000000..cc4f68b90e9e
--- /dev/null
+++ b/baselines/fedbn/README.md
@@ -0,0 +1,159 @@
+---
+title: "FedBN: Federated Learning on Non-IID Features via Local Batch Normalization"
+url: https://arxiv.org/abs/2102.07623
+labels: [data heterogeneity, feature shift, cross-silo]
+dataset: [MNIST, MNIST-M, SVHN, USPS, SynthDigits]
+---
+
+# FedBN: Federated Learning on Non-IID Features via Local Batch Normalization
+
+> Note: If you use this baseline in your work, please remember to cite the original authors of the paper as well as the Flower paper.
+
+
+**Paper:** [arxiv.org/abs/2102.07623](https://arxiv.org/abs/2102.07623)
+
+**Authors:** Xiaoxiao Li, Meirui Jiang, Xiaofei Zhang, Michael Kamp, Qi Dou
+
+**Abstract:** The emerging paradigm of federated learning (FL) strives to enable collaborative training of deep models on the network edge without centrally aggregating raw data and hence improving data privacy. In most cases, the assumption of independent and identically distributed samples across local clients does not hold for federated learning setups. Under this setting, neural network training performance may vary significantly according to the data distribution and even hurt training convergence. Most of the previous work has focused on a difference in the distribution of labels or client shifts. Unlike those settings, we address an important problem of FL, e.g., different scanners/sensors in medical imaging, different scenery distribution in autonomous driving (highway vs. city), where local clients store examples with different distributions compared to other clients, which we denote as feature shift non-iid. In this work, we propose an effective method that uses local batch normalization to alleviate the feature shift before averaging models. The resulting scheme, called FedBN, outperforms both classical FedAvg, as well as the state-of-the-art for non-iid data (FedProx) on our extensive experiments. These empirical results are supported by a convergence analysis that shows in a simplified setting that FedBN has a faster convergence rate than FedAvg.
+
+
+## About this baseline
+
+**What’s implemented:** Figure 3 in the paper: convergence in training loss comparing `FedBN` to `FedAvg` for five datasets.
+
+**Datasets:** Vision datasets including digits 0-9. These datasets are: [MNIST](https://ieeexplore.ieee.org/document/726791), [MNIST-M](https://arxiv.org/pdf/1505.07818.pdf), [SVHN](http://ufldl.stanford.edu/housenumbers/nips2011_housenumbers.pdf), [USPS](https://ieeexplore.ieee.org/document/291440), and [SynthDigits](https://arxiv.org/pdf/1505.07818.pdf).
+
+**Hardware Setup:** Using the default configurations, any machine with 8 CPU cores should be able to run 100 rounds of FedAvg or FedBN in under 5 minutes. A GPU is therefore not needed if you stick to the small model used in the paper and limit clients to using 10% of the data in each dataset (these are the default settings).
+
+**Contributors:** Meirui Jiang, Maria Boerner, Javier Fernandez-Marques
+
+
+## Experimental Setup
+
+**Task:** Image classification
+
+**Model:** A six-layer CNN with 14,219,210 parameters following the structure described in appendix D.2.
+
+**Dataset:** This baseline makes use of the pre-processed partitions created and open-sourced by the authors of the FedBN paper. You can read more about how those were created [here](https://github.com/med-air/FedBN).
Follow the steps below in the `Environment Setup` section to download them.
+
+
+A more detailed explanation of the datasets is given in the following table.
+
+| | MNIST | MNIST-M | SVHN | USPS | SynthDigits |
+|--- |--- |--- |--- |--- |--- |
+| data type | handwritten digits | MNIST modification randomly colored with colored patches | street view house numbers | handwritten digits from envelopes by the U.S. Postal Service | synthetic digits rendered with Windows™ fonts, varying the orientation, blur and stroke colors |
+| color | greyscale | RGB | RGB | greyscale | RGB |
+| pixel size | 28x28 | 28x28 | 32x32 | 16x16 | 32x32 |
+| labels | 0-9 | 0-9 | 1-10 | 0-9 | 1-10 |
+| training set size | 60,000 | 60,000 | 73,257 | 9,298 | 50,000 |
+| test set size | 10,000 | 10,000 | 26,032 | - | - |
+| image shape | (28,28) | (28,28,3) | (32,32,3) | (16,16) | (32,32,3) |
+
+
+**Training Hyperparameters:** By default (i.e. if you don't override anything in the config) the main hyperparameters used are shown in the table below. For a complete list of hyperparameters, please refer to the config files in `fedbn/conf`.
+
+| Description | Value |
+| ----------- | ----- |
+| rounds | 10 |
+| num_clients | 5 |
+| strategy.fraction_fit | 1.0 |
+| strategy.fraction_evaluate | 0.0 |
+| training samples per client | 743 |
+| lr | 10E-2 |
+| local epochs | 1 |
+| loss | cross entropy loss |
+| optimizer | SGD |
+| client_resources.num_cpu | 2 |
+| client_resources.num_gpus | 0.0 |
+
+## Environment Setup
+
+To construct the Python environment, simply run:
+
+```bash
+# Set directory to use python 3.10 (install with `pyenv install 3.10.6` if you don't have it)
+pyenv local 3.10.6
+
+# Tell poetry to use python3.10
+poetry env use 3.10.6
+
+# Install
+poetry install
+```
+
+Before running the experiments you'll need to download the five datasets for this baseline. We'll be using the pre-processed datasets created by the `FedBN` authors. Download the dataset from [here](https://mycuhk-my.sharepoint.com/:u:/g/personal/1155149226_link_cuhk_edu_hk/EV1YgHfFC4RKjw06NL4JMdgBMU21CegM12SpXrSmGjt3XA?e=XK2rFs) and move the file into a new directory named `data`.
+```bash
+mkdir data
+# Move the downloaded zip into the new directory (adjust the path to wherever you saved it)
+mv digit_dataset.zip data/
+
+# now uncompress the zipfile
+cd data && unzip digit_dataset.zip
+cd ..
+```
+
+## Running the Experiments
+
+First, activate your environment via `poetry shell`. The commands below show how to run the experiments and modify some of their key hyperparameters via the CLI. Each time you run an experiment, the log and results will be stored inside `outputs//