diff --git a/.github/workflows/datasets.yml b/.github/workflows/datasets.yml index 60eb9e49db45..47e9f2aed926 100644 --- a/.github/workflows/datasets.yml +++ b/.github/workflows/datasets.yml @@ -4,9 +4,13 @@ on: push: branches: - main + paths: + - "datasets/**" pull_request: branches: - main + paths: + - "datasets/**" concurrency: group: ${{ github.workflow }}-${{ github.ref == 'refs/heads/main' && github.run_id || github.event.pull_request.number || github.ref }} diff --git a/.github/workflows/docs.yml b/.github/workflows/docs.yml index 52f9a49a259a..78b04c5138d4 100644 --- a/.github/workflows/docs.yml +++ b/.github/workflows/docs.yml @@ -21,6 +21,8 @@ jobs: name: Build and deploy steps: - uses: actions/checkout@v4 + with: + fetch-depth: 0 - name: Bootstrap uses: ./.github/actions/bootstrap - name: Install pandoc @@ -39,3 +41,4 @@ jobs: aws s3 sync --delete --exclude ".*" --exclude "v/*" --cache-control "no-cache" ./doc/build/html/ s3://flower.dev/docs/framework aws s3 sync --delete --exclude ".*" --exclude "v/*" --cache-control "no-cache" ./baselines/doc/build/html/ s3://flower.dev/docs/baselines aws s3 sync --delete --exclude ".*" --exclude "v/*" --cache-control "no-cache" ./examples/doc/build/html/ s3://flower.dev/docs/examples + aws s3 sync --delete --exclude ".*" --exclude "v/*" --cache-control "no-cache" ./datasets/doc/build/html/ s3://flower.dev/docs/datasets diff --git a/.github/workflows/e2e.yml b/.github/workflows/e2e.yml index 3b70db43a6c8..214c8579d450 100644 --- a/.github/workflows/e2e.yml +++ b/.github/workflows/e2e.yml @@ -16,9 +16,39 @@ env: FLWR_TELEMETRY_ENABLED: 0 jobs: + wheel: + runs-on: ubuntu-22.04 + name: Build, test and upload wheel + steps: + - uses: actions/checkout@v3 + - name: Bootstrap + uses: ./.github/actions/bootstrap + - name: Install dependencies (mandatory only) + run: python -m poetry install + - name: Build wheel + run: ./dev/build.sh + - name: Test wheel + run: ./dev/test-wheel.sh + - name: Upload wheel + id: upload + env: + AWS_DEFAULT_REGION: ${{ secrets.AWS_DEFAULT_REGION }} + AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_ID }} + AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
+ run: | + cd ./dist + echo "WHL_PATH=$(ls *.whl)" >> "$GITHUB_OUTPUT" + sha_short=$(git rev-parse --short HEAD) + echo "SHORT_SHA=$sha_short" >> "$GITHUB_OUTPUT" + aws s3 cp --content-disposition "attachment" --cache-control "no-cache" ./ s3://artifact.flower.dev/py/${{ github.head_ref }}/$sha_short --recursive + outputs: + whl_path: ${{ steps.upload.outputs.WHL_PATH }} + short_sha: ${{ steps.upload.outputs.SHORT_SHA }} + frameworks: runs-on: ubuntu-22.04 timeout-minutes: 10 + needs: wheel # Using approach described here: # https://docs.github.com/en/actions/using-jobs/using-a-matrix-for-your-jobs strategy: @@ -89,6 +119,9 @@ jobs: python-version: 3.8 - name: Install dependencies run: python -m poetry install + - name: Install Flower wheel from artifact store + run: | + python -m pip install https://artifact.flower.dev/py/${{ github.head_ref }}/${{ needs.wheel.outputs.short_sha }}/${{ needs.wheel.outputs.whl_path }} - name: Download dataset if: ${{ matrix.dataset }} run: python -c "${{ matrix.dataset }}" @@ -102,6 +135,7 @@ jobs: strategies: runs-on: ubuntu-22.04 timeout-minutes: 10 + needs: wheel strategy: matrix: strat: ["FedMedian", "FedTrimmedAvg", "QFedAvg", "FaultTolerantFedAvg", "FedAvgM", "FedAdam", "FedAdagrad", "FedYogi"] @@ -119,6 +153,9 @@ jobs: - name: Install dependencies run: | python -m poetry install + - name: Install Flower wheel from artifact store + run: | + python -m pip install https://artifact.flower.dev/py/${{ github.head_ref }}/${{ needs.wheel.outputs.short_sha }}/${{ needs.wheel.outputs.whl_path }} - name: Cache Datasets uses: actions/cache@v3 with: diff --git a/datasets/doc/source/index.rst b/datasets/doc/source/index.rst index 81a08286b6fd..7b19624b341a 100644 --- a/datasets/doc/source/index.rst +++ b/datasets/doc/source/index.rst @@ -1,8 +1,93 @@ -Flower Datasets Documentation -============================= +Flower Datasets +=============== + +Flower Datasets (``flwr-datasets``) is a library to quickly and easily create datasets for federated +learning/analytics/evaluation. It is created by the ``Flower Labs`` team that also created `Flower <https://flower.dev>`_ - a Friendly Federated Learning Framework. + +Flower Datasets Framework +------------------------- + +Tutorials +~~~~~~~~~ + +A learning-oriented series of tutorials is the best place to start. + +.. toctree:: + :maxdepth: 1 + :caption: Tutorial + + tutorial-quickstart + +How-to guides +~~~~~~~~~~~~~ + +Problem-oriented how-to guides show step-by-step how to achieve a specific goal. + +.. toctree:: + :maxdepth: 1 + :caption: How-to guides + + how-to-install-flwr-datasets + how-to-use-with-pytorch + how-to-use-with-tensorflow + how-to-use-with-numpy + how-to-disable-enable-progress-bar + +References +~~~~~~~~~~ + +Information-oriented API reference and other reference material. + +.. toctree:: + :maxdepth: 2 + :caption: API reference + + ref-api-flwr-datasets + +Main features +------------- +Flower Datasets library supports: + +- **downloading datasets** - choose the dataset from Hugging Face's ``datasets`` +- **partitioning datasets** - customize the partitioning scheme +- **creating centralized datasets** - leave parts of the dataset unpartitioned (e.g. for centralized evaluation) -Welcome to Flower Datasets' documentation. `Flower <https://flower.dev>`_ is a friendly federated learning framework.
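To make the features above concrete, loading one client's partition might look roughly like this (a minimal sketch assuming the ``cifar10`` dataset, 100 partitions of the ``train`` split, and that ``load_full`` returns an unpartitioned split):

.. code-block:: python

    from flwr_datasets import FederatedDataset

    # Download CIFAR-10 and partition its "train" split across 100 clients
    fds = FederatedDataset(dataset="cifar10", partitioners={"train": 100})

    # Load the partition that belongs to the first client
    partition = fds.load_partition(0, "train")

    # Keep the "test" split unpartitioned, e.g. for centralized evaluation
    centralized_dataset = fds.load_full("test")  # load_full is assumed here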
+Thanks to using Hugging Face's ``datasets`` under the hood, Flower Datasets integrates with the following popular formats/frameworks: +- Hugging Face +- PyTorch +- TensorFlow +- NumPy +- Pandas +- JAX +- Arrow + +Install +------- + +The simplest install is + +.. code-block:: bash + + python -m pip install flwr-datasets + +If you plan to use the image datasets + +.. code-block:: bash + + python -m pip install flwr-datasets[vision] + +If you plan to use the audio datasets + +.. code-block:: bash + + python -m pip install flwr-datasets[audio] + +Check out the full installation details in :doc:`how-to-install-flwr-datasets`. + +How to use the library +---------------------- +Learn how to use the ``flwr-datasets`` library from the :doc:`tutorial-quickstart` examples. Join the Flower Community ------------------------- @@ -14,9 +99,3 @@ The Flower Community is growing quickly - we're a friendly group of researchers, :shadow: Join us on Slack - - -Flower Datasets ---------------- - - diff --git a/datasets/doc/source/tutorial-quickstart.rst b/datasets/doc/source/tutorial-quickstart.rst index 69d42f16a3b6..8a70ee8854be 100644 --- a/datasets/doc/source/tutorial-quickstart.rst +++ b/datasets/doc/source/tutorial-quickstart.rst @@ -5,17 +5,23 @@ Run Flower Datasets as fast as possible by learning only the essentials. Install Federated Datasets -------------------------- -Run on the command line:: +Run on the command line + +.. code-block:: bash + + python -m pip install flwr-datasets[vision] Install the ML framework ------------------------ -TensorFlow:: +TensorFlow + +.. code-block:: bash pip install tensorflow -PyTorch:: +PyTorch + +.. code-block:: bash pip install torch torchvision @@ -41,7 +47,7 @@ supported by your framework. Conversion ---------- -For more detailed instructions, go to :doc:`how-to`. +For more detailed instructions, go to :doc:`how-to-use-with-pytorch`.
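As an illustration of such a conversion, handing a partition to TensorFlow could look roughly like this (a sketch assuming the ``cifar10`` dataset, whose examples expose ``img`` and ``label`` columns):

.. code-block:: python

    from flwr_datasets import FederatedDataset

    fds = FederatedDataset(dataset="cifar10", partitioners={"train": 100})
    partition = fds.load_partition(0, "train")

    # Turn the Hugging Face Dataset partition into a batched tf.data.Dataset
    tf_dataset = partition.to_tf_dataset(
        columns="img", label_cols="label", batch_size=32, shuffle=True
    )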
PyTorch DataLoader ^^^^^^^^^^^^^^^^^^ diff --git a/datasets/e2e/tensorflow/pyproject.toml b/datasets/e2e/tensorflow/pyproject.toml new file mode 100644 index 000000000000..9c5c72c46400 --- /dev/null +++ b/datasets/e2e/tensorflow/pyproject.toml @@ -0,0 +1,15 @@ +[build-system] +requires = ["poetry-core>=1.4.0"] +build-backend = "poetry.core.masonry.api" + +[tool.poetry] +name = "fds-e2e-tensorflow" +version = "0.1.0" +description = "Flower Datasets with TensorFlow" +authors = ["The Flower Authors "] + +[tool.poetry.dependencies] +python = "^3.8" +flwr-datasets = { path = "./../../", extras = ["vision"] } +tensorflow-cpu = "^2.9.1, !=2.11.1" +parameterized = "==0.9.0" diff --git a/datasets/e2e/tensorflow/tensorflow_test.py b/datasets/e2e/tensorflow/tensorflow_test.py new file mode 100644 index 000000000000..e041bcb8f8cc --- /dev/null +++ b/datasets/e2e/tensorflow/tensorflow_test.py @@ -0,0 +1,102 @@ +import unittest + +import numpy as np +import tensorflow as tf +from datasets.utils.logging import disable_progress_bar +from parameterized import parameterized_class, parameterized +from tensorflow.keras.layers import Conv2D, MaxPooling2D, Flatten, Dense +from tensorflow.keras.models import Sequential + +from flwr_datasets import FederatedDataset + + +def SimpleCNN(): + model = Sequential([ + Conv2D(32, (3, 3), activation='relu', input_shape=(32, 32, 3)), + MaxPooling2D(2, 2), + Conv2D(64, (3, 3), activation='relu'), + MaxPooling2D(2, 2), + Flatten(), + Dense(64, activation='relu'), + Dense(10, activation='softmax') + ]) + return model + + +@parameterized_class( + [ + {"dataset_name": "cifar10", "test_split": "test"}, + {"dataset_name": "cifar10", "test_split": "test"}, + ] +) +class FdsToTensorFlow(unittest.TestCase): + """Test the conversion from FDS to TensorFlow Dataset.""" + + dataset_name = "" + test_split = "" + expected_img_shape_after_transform = [32, 32, 3] + + @classmethod + def setUpClass(cls): + """Disable progress bar to keep the log clean.
+ """ + disable_progress_bar() + + def _create_tensorflow_dataset(self, batch_size: int) -> tf.data.Dataset: + """Create a tensorflow dataset from the FederatedDataset.""" + partition_id = 0 + fds = FederatedDataset(dataset=self.dataset_name, partitioners={"train": 100}) + partition = fds.load_partition(partition_id, "train") + tf_dataset = partition.to_tf_dataset(columns="img", label_cols="label", + batch_size=batch_size, + shuffle=False) + return tf_dataset + + def test_create_partition_dataset_shape(self) -> None: + """Test if the DataLoader returns batches with the expected shape.""" + batch_size = 16 + dataset = self._create_tensorflow_dataset(batch_size) + batch = next(iter(dataset)) + images = batch[0] + self.assertEqual(tuple(images.shape), + (batch_size, *self.expected_img_shape_after_transform)) + + def test_create_partition_dataloader_with_transforms_batch_type(self) -> None: + """Test if the DataLoader returns batches of type dictionary.""" + batch_size = 16 + dataset = self._create_tensorflow_dataset(batch_size) + batch = next(iter(dataset)) + self.assertIsInstance(batch, tuple) + + def test_create_partition_dataloader_with_transforms_data_type(self) -> None: + """Test to verify if the data in the DataLoader batches are of type Tensor.""" + batch_size = 16 + dataset = self._create_tensorflow_dataset(batch_size) + batch = next(iter(dataset)) + images = batch[0] + self.assertIsInstance(images, tf.Tensor) + + @parameterized.expand([ + ("not_nan", np.isnan), + ("not_inf", np.isinf), + ]) + def test_train_model_loss_value(self, name, condition_func): + model = SimpleCNN() + model.compile(optimizer='adam', + loss='sparse_categorical_crossentropy', + metrics=['accuracy']) + + dataset = self._create_tensorflow_dataset(16) + + # Perform a single epoch of training + history = model.fit(dataset, epochs=1, verbose=0) + + # Fetch the last loss from history + last_loss = history.history['loss'][-1] + + # Check if the last loss is NaN or Infinity + self.assertFalse(condition_func(last_loss)) + + +if __name__ == '__main__': + unittest.main() diff --git a/datasets/flwr_datasets/partitioner/iid_partitioner.py b/datasets/flwr_datasets/partitioner/iid_partitioner.py index c8dbf8294fec..37b97468cadf 100644 --- a/datasets/flwr_datasets/partitioner/iid_partitioner.py +++ b/datasets/flwr_datasets/partitioner/iid_partitioner.py @@ -48,5 +48,5 @@ def load_partition(self, idx: int) -> datasets.Dataset: single dataset partition """ return self.dataset.shard( - num_shards=self._num_partitions, index=idx, contiguous=True + num_shards=self._num_partitions, index=idx, contiguous=False ) diff --git a/datasets/flwr_datasets/partitioner/iid_partitioner_test.py b/datasets/flwr_datasets/partitioner/iid_partitioner_test.py index d89eefeba9f2..5f851807f4bd 100644 --- a/datasets/flwr_datasets/partitioner/iid_partitioner_test.py +++ b/datasets/flwr_datasets/partitioner/iid_partitioner_test.py @@ -18,6 +18,7 @@ import unittest from typing import Tuple +import numpy as np from parameterized import parameterized from datasets import Dataset @@ -100,11 +101,16 @@ def test_load_partition_correct_data( self, num_partitions: int, num_rows: int ) -> None: """Test if the data in partition is equal to the expected.""" - _, partitioner = _dummy_setup(num_partitions, num_rows) - partition_size = num_rows // num_partitions + dataset, partitioner = _dummy_setup(num_partitions, num_rows) partition_index = 2 partition = partitioner.load_partition(partition_index) - self.assertEqual(partition["features"][0], partition_index * 
partition_size) + row_id = 0 + self.assertEqual( + partition["features"][row_id], + dataset[np.arange(partition_index, len(dataset), num_partitions)][ + "features" + ][row_id], + ) @parameterized.expand( # type: ignore [ diff --git a/datasets/pyproject.toml b/datasets/pyproject.toml index 6067ff0517db..954441e5d2e4 100644 --- a/datasets/pyproject.toml +++ b/datasets/pyproject.toml @@ -10,7 +10,7 @@ license = "Apache-2.0" authors = ["The Flower Authors "] readme = "README.md" homepage = "https://flower.dev" -repository = "https://github.com/adap/flower/datasets" +repository = "https://github.com/adap/flower" documentation = "https://flower.dev/docs/datasets" keywords = [ "flower", diff --git a/dev/build-docs.sh b/dev/build-docs.sh index ca57536901b2..c464cf908c87 100755 --- a/dev/build-docs.sh +++ b/dev/build-docs.sh @@ -17,3 +17,7 @@ cd examples/doc make docs cd $ROOT +cd datasets/doc +make docs + +cd $ROOT diff --git a/examples/flower-in-30-minutes/tutorial.ipynb b/examples/flower-in-30-minutes/tutorial.ipynb index b1686529c462..336ec4c19644 100644 --- a/examples/flower-in-30-minutes/tutorial.ipynb +++ b/examples/flower-in-30-minutes/tutorial.ipynb @@ -23,7 +23,7 @@ "## Complementary Content\n", "\n", "But before do so, let me point you to a few video tutorials in the [Flower Youtube channel](https://www.youtube.com/@flowerlabs) that you might want to check out after this tutorial. We post new videos fairly regularly with new content:\n", - "* **[VIDEO]** quickstart-tensorflow: [15-min video on how to start with Flower + Tensorflow/Keras](https://www.youtube.com/watch?v=jOmmuzMIQ4c)\n", + "* **[VIDEO]** quickstart-tensorflow: [15-min video on how to start with Flower + Tensorflow/Keras](https://www.youtube.com/watch?v=FGTc2TQq7VM)\n", "* **[VIDEO]** quickstart-pytorch: [20-min video on how to start with Flower + PyTorch](https://www.youtube.com/watch?v=jOmmuzMIQ4c)\n", "* **[VIDEO]** Flower simulation mini-series: [9 line-by-line video tutorials](https://www.youtube.com/watch?v=cRebUIGB5RU&list=PLNG4feLHqCWlnj8a_E1A_n5zr2-8pafTB)" ] diff --git a/examples/secaggplus-mt/driver.py b/examples/secaggplus-mt/driver.py index c168edf070af..4e0a53ed1c91 100644 --- a/examples/secaggplus-mt/driver.py +++ b/examples/secaggplus-mt/driver.py @@ -23,7 +23,7 @@ def merge(_task: task_pb2.Task, _merge_task: task_pb2.Task) -> task_pb2.Task: task_pb2.TaskIns( task_id="", # Do not set, will be created and set by the DriverAPI group_id="", - workload_id="", + workload_id=workload_id, task=merge( task, task_pb2.Task( @@ -84,8 +84,14 @@ def weighted_average(metrics: List[Tuple[int, Metrics]]) -> Metrics: # -------------------------------------------------------------------------- Driver SDK driver.connect() +create_workload_res: driver_pb2.CreateWorkloadResponse = driver.create_workload( + req=driver_pb2.CreateWorkloadRequest() +) # -------------------------------------------------------------------------- Driver SDK +workload_id = create_workload_res.workload_id +print(f"Created workload id {workload_id}") + history = History() for server_round in range(num_rounds): print(f"Commencing server round {server_round + 1}") @@ -113,7 +119,7 @@ def weighted_average(metrics: List[Tuple[int, Metrics]]) -> Metrics: # loop and wait until enough client nodes are available. 
while True: # Get a list of node ID's from the server - get_nodes_req = driver_pb2.GetNodesRequest() + get_nodes_req = driver_pb2.GetNodesRequest(workload_id=workload_id) # ---------------------------------------------------------------------- Driver SDK get_nodes_res: driver_pb2.GetNodesResponse = driver.get_nodes( @@ -121,7 +127,7 @@ def weighted_average(metrics: List[Tuple[int, Metrics]]) -> Metrics: ) # ---------------------------------------------------------------------- Driver SDK - all_node_ids: List[int] = get_nodes_res.node_ids + all_node_ids: List[int] = [node.node_id for node in get_nodes_res.nodes] if len(all_node_ids) >= num_client_nodes_per_round: # Sample client nodes
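Taken together, the driver-side changes above amount to a flow along these lines (a rough sketch only; the ``driver`` object and the ``driver_pb2`` messages are the ones used in ``examples/secaggplus-mt/driver.py``, while the import path is an assumption):

    from flwr.proto import driver_pb2  # assumed location of the generated messages

    driver.connect()

    # Register a new workload and keep its id
    create_workload_res = driver.create_workload(req=driver_pb2.CreateWorkloadRequest())
    workload_id = create_workload_res.workload_id

    # Scope every subsequent request to that workload
    get_nodes_res = driver.get_nodes(req=driver_pb2.GetNodesRequest(workload_id=workload_id))
    all_node_ids = [node.node_id for node in get_nodes_res.nodes]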