
Commit

Merge branch 'main' into minimal_ray
jafermarq authored Sep 28, 2023
2 parents 0ee17fa + 4a8ac49 commit 29dd1a7
Showing 13 changed files with 284 additions and 22 deletions.
4 changes: 4 additions & 0 deletions .github/workflows/datasets.yml
@@ -4,9 +4,13 @@ on:
  push:
    branches:
      - main
    paths:
      - "datasets/**"
  pull_request:
    branches:
      - main
    paths:
      - "datasets/**"

concurrency:
  group: ${{ github.workflow }}-${{ github.ref == 'refs/heads/main' && github.run_id || github.event.pull_request.number || github.ref }}
3 changes: 3 additions & 0 deletions .github/workflows/docs.yml
@@ -21,6 +21,8 @@ jobs:
    name: Build and deploy
    steps:
      - uses: actions/checkout@v4
        with:
          fetch-depth: 0
      - name: Bootstrap
        uses: ./.github/actions/bootstrap
      - name: Install pandoc
@@ -39,3 +41,4 @@ jobs:
          aws s3 sync --delete --exclude ".*" --exclude "v/*" --cache-control "no-cache" ./doc/build/html/ s3://flower.dev/docs/framework
          aws s3 sync --delete --exclude ".*" --exclude "v/*" --cache-control "no-cache" ./baselines/doc/build/html/ s3://flower.dev/docs/baselines
          aws s3 sync --delete --exclude ".*" --exclude "v/*" --cache-control "no-cache" ./examples/doc/build/html/ s3://flower.dev/docs/examples
          aws s3 sync --delete --exclude ".*" --exclude "v/*" --cache-control "no-cache" ./datasets/doc/build/html/ s3://flower.dev/docs/datasets
37 changes: 37 additions & 0 deletions .github/workflows/e2e.yml
@@ -16,9 +16,39 @@ env:
  FLWR_TELEMETRY_ENABLED: 0

jobs:
  wheel:
    runs-on: ubuntu-22.04
    name: Build, test and upload wheel
    steps:
      - uses: actions/checkout@v3
      - name: Bootstrap
        uses: ./.github/actions/bootstrap
      - name: Install dependencies (mandatory only)
        run: python -m poetry install
      - name: Build wheel
        run: ./dev/build.sh
      - name: Test wheel
        run: ./dev/test-wheel.sh
      - name: Upload wheel
        id: upload
        env:
          AWS_DEFAULT_REGION: ${{ secrets.AWS_DEFAULT_REGION }}
          AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_ID }}
          AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
        run: |
          cd ./dist
          echo "WHL_PATH=$(ls *.whl)" >> "$GITHUB_OUTPUT"
          sha_short=$(git rev-parse --short HEAD)
          echo "SHORT_SHA=$sha_short" >> "$GITHUB_OUTPUT"
          aws s3 cp --content-disposition "attachment" --cache-control "no-cache" ./ s3://artifact.flower.dev/py/${{ github.head_ref }}/$sha_short --recursive
    outputs:
      whl_path: ${{ steps.upload.outputs.WHL_PATH }}
      short_sha: ${{ steps.upload.outputs.SHORT_SHA }}

  frameworks:
    runs-on: ubuntu-22.04
    timeout-minutes: 10
    needs: wheel
    # Using approach described here:
    # https://docs.github.com/en/actions/using-jobs/using-a-matrix-for-your-jobs
    strategy:
@@ -89,6 +119,9 @@ jobs:
          python-version: 3.8
      - name: Install dependencies
        run: python -m poetry install
      - name: Install Flower wheel from artifact store
        run: |
          python -m pip install https://artifact.flower.dev/py/${{ github.head_ref }}/${{ needs.wheel.outputs.short_sha }}/${{ needs.wheel.outputs.whl_path }}
      - name: Download dataset
        if: ${{ matrix.dataset }}
        run: python -c "${{ matrix.dataset }}"
@@ -102,6 +135,7 @@ jobs:
  strategies:
    runs-on: ubuntu-22.04
    timeout-minutes: 10
    needs: wheel
    strategy:
      matrix:
        strat: ["FedMedian", "FedTrimmedAvg", "QFedAvg", "FaultTolerantFedAvg", "FedAvgM", "FedAdam", "FedAdagrad", "FedYogi"]
@@ -119,6 +153,9 @@ jobs:
      - name: Install dependencies
        run: |
          python -m poetry install
      - name: Install Flower wheel from artifact store
        run: |
          python -m pip install https://artifact.flower.dev/py/${{ github.head_ref }}/${{ needs.wheel.outputs.short_sha }}/${{ needs.wheel.outputs.whl_path }}
      - name: Cache Datasets
        uses: actions/cache@v3
        with:
97 changes: 88 additions & 9 deletions datasets/doc/source/index.rst
@@ -1,8 +1,93 @@
Flower Datasets Documentation
=============================
Flower Datasets
===============

Flower Datasets (``flwr-datasets``) is a library for quickly and easily creating datasets for federated
learning, analytics, and evaluation. It is created by the ``Flower Labs`` team, which also created `Flower <https://flower.dev>`_ - a Friendly Federated Learning Framework.

Flower Datasets Framework
-------------------------

Tutorials
~~~~~~~~~

A learning-oriented series of tutorials is the best place to start.

.. toctree::
   :maxdepth: 1
   :caption: Tutorial

   tutorial-quickstart

How-to guides
~~~~~~~~~~~~~

Problem-oriented how-to guides show step-by-step how to achieve a specific goal.

.. toctree::
   :maxdepth: 1
   :caption: How-to guides

   how-to-install-flwr-datasets
   how-to-use-with-pytorch
   how-to-use-with-tensorflow
   how-to-use-with-numpy
   how-to-disable-enable-progress-bar

References
~~~~~~~~~~

Information-oriented API reference and other reference material.

.. toctree::
   :maxdepth: 2
   :caption: API reference

   ref-api-flwr-datasets

Main features
-------------
The Flower Datasets library supports:

- **downloading datasets** - choose a dataset from Hugging Face's ``datasets``
- **partitioning datasets** - customize the partitioning scheme
- **creating centralized datasets** - leave parts of the dataset unpartitioned (e.g. for centralized evaluation)

Welcome to Flower Datasets' documentation. `Flower <https://flower.dev>`_ is a friendly federated learning framework.
Because Hugging Face's ``datasets`` library is used under the hood, Flower Datasets integrates with the following popular formats/frameworks (see the usage sketch below):

- Hugging Face
- PyTorch
- TensorFlow
- Numpy
- Pandas
- Jax
- Arrow
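
A minimal usage sketch (``cifar10`` and the 100-partition split are example values; the ``load_full`` call for the unpartitioned split is an assumption - check the API reference for the exact name):

.. code-block:: python

   from flwr_datasets import FederatedDataset

   # Download CIFAR-10 and split its train set into 100 IID partitions
   fds = FederatedDataset(dataset="cifar10", partitioners={"train": 100})

   # Load the partition assigned to client 0
   partition = fds.load_partition(0, "train")

   # Keep the test split unpartitioned, e.g. for centralized evaluation
   centralized = fds.load_full("test")  # assumed helper for centralized splits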

Install
-------

The simplest install is

.. code-block:: bash

   python -m pip install flwr-datasets

If you plan to use the image datasets

.. code-block:: bash

   python -m pip install flwr-datasets[vision]

If you plan to use the audio datasets

.. code-block:: bash

   python -m pip install flwr-datasets[audio]

Check out the full details on the download in :doc:`how-to-install-flwr-datasets`.

How To Use the library
----------------------
Learn how to use the ``flwr-datasets`` library from the :doc:`tutorial-quickstart` examples.

Join the Flower Community
-------------------------
@@ -14,9 +99,3 @@ The Flower Community is growing quickly - we're a friendly group of researchers,
:shadow:

Join us on Slack


Flower Datasets
---------------


14 changes: 10 additions & 4 deletions datasets/doc/source/tutorial-quickstart.rst
@@ -5,17 +5,23 @@ Run Flower Datasets as fast as possible by learning only the essentials.

Install Federated Datasets
--------------------------
Run on the command line::
Run on the command line

.. code-block:: bash

   python -m pip install flwr-datasets[vision]

Install the ML framework
------------------------
TensorFlow::
TensorFlow

.. code-block:: bash

   pip install tensorflow

PyTorch::
PyTorch

.. code-block:: bash

   pip install torch torchvision

@@ -41,7 +47,7 @@ supported by your framework.

Conversion
----------
For more detailed instructions, go to :doc:`how-to`.
For more detailed instructions, go to :doc:`how-to-use-with-pytorch`.

PyTorch DataLoader
^^^^^^^^^^^^^^^^^^
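
A minimal conversion sketch, assuming a ``partition`` obtained via ``FederatedDataset.load_partition`` (``with_format`` comes from Hugging Face ``datasets``; the batch size is just an example):

.. code-block:: python

   from torch.utils.data import DataLoader

   from flwr_datasets import FederatedDataset

   fds = FederatedDataset(dataset="cifar10", partitioners={"train": 100})
   partition = fds.load_partition(0, "train")

   # Return PyTorch tensors from the Hugging Face dataset, then wrap it
   partition_torch = partition.with_format("torch")
   dataloader = DataLoader(partition_torch, batch_size=16, shuffle=True)
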
15 changes: 15 additions & 0 deletions datasets/e2e/tensorflow/pyproject.toml
@@ -0,0 +1,15 @@
[build-system]
requires = ["poetry-core>=1.4.0"]
build-backend = "poetry.core.masonry.api"

[tool.poetry]
name = "fds-e2-tensorflow"
version = "0.1.0"
description = "Flower Datasets with TensorFlow"
authors = ["The Flower Authors <[email protected]>"]

[tool.poetry.dependencies]
python = "^3.8"
flwr-datasets = { path = "./../../", extras = ["vision"] }
tensorflow-cpu = "^2.9.1, !=2.11.1"
parameterized = "==0.9.0"
102 changes: 102 additions & 0 deletions datasets/e2e/tensorflow/tensorflow_test.py
@@ -0,0 +1,102 @@
import unittest

import numpy as np
import tensorflow as tf
from datasets.utils.logging import disable_progress_bar
from parameterized import parameterized_class, parameterized
from tensorflow.keras.layers import Conv2D, MaxPooling2D, Flatten, Dense
from tensorflow.keras.models import Sequential

from flwr_datasets import FederatedDataset


def SimpleCNN():
    model = Sequential([
        Conv2D(32, (3, 3), activation='relu', input_shape=(32, 32, 3)),
        MaxPooling2D(2, 2),
        Conv2D(64, (3, 3), activation='relu'),
        MaxPooling2D(2, 2),
        Flatten(),
        Dense(64, activation='relu'),
        Dense(10, activation='softmax')
    ])
    return model


@parameterized_class(
    [
        {"dataset_name": "cifar10", "test_split": "test"},
        {"dataset_name": "cifar10", "test_split": "test"},
    ]
)
class FdsToTensorFlow(unittest.TestCase):
    """Test the conversion from FDS to TensorFlow Dataset."""

    dataset_name = ""
    test_split = ""
    expected_img_shape_after_transform = [32, 32, 3]

    @classmethod
    def setUpClass(cls):
        """Disable progress bar to keep the log clean.
        """
        disable_progress_bar()

    def _create_tensorflow_dataset(self, batch_size: int) -> tf.data.Dataset:
        """Create a tensorflow dataset from the FederatedDataset."""
        partition_id = 0
        fds = FederatedDataset(dataset=self.dataset_name, partitioners={"train": 100})
        partition = fds.load_partition(partition_id, "train")
        tf_dataset = partition.to_tf_dataset(columns="img", label_cols="label",
                                             batch_size=batch_size,
                                             shuffle=False)
        return tf_dataset

    def test_create_partition_dataset_shape(self) -> None:
        """Test if the dataset returns batches with the expected shape."""
        batch_size = 16
        dataset = self._create_tensorflow_dataset(batch_size)
        batch = next(iter(dataset))
        images = batch[0]
        self.assertEqual(tuple(images.shape),
                         (batch_size, *self.expected_img_shape_after_transform))

    def test_create_partition_dataloader_with_transforms_batch_type(self) -> None:
        """Test if the dataset returns batches of type tuple."""
        batch_size = 16
        dataset = self._create_tensorflow_dataset(batch_size)
        batch = next(iter(dataset))
        self.assertIsInstance(batch, tuple)

    def test_create_partition_dataloader_with_transforms_data_type(self) -> None:
        """Test to verify if the data in the batches is of type Tensor."""
        batch_size = 16
        dataset = self._create_tensorflow_dataset(batch_size)
        batch = next(iter(dataset))
        images = batch[0]
        self.assertIsInstance(images, tf.Tensor)

    @parameterized.expand([
        ("not_nan", np.isnan),
        ("not_inf", np.isinf),
    ])
    def test_train_model_loss_value(self, name, condition_func):
        model = SimpleCNN()
        model.compile(optimizer='adam',
                      loss='sparse_categorical_crossentropy',
                      metrics=['accuracy'])

        dataset = self._create_tensorflow_dataset(16)

        # Perform a single epoch of training
        history = model.fit(dataset, epochs=1, verbose=0)

        # Fetch the last loss from history
        last_loss = history.history['loss'][-1]

        # Check if the last loss is NaN or Infinity
        self.assertFalse(condition_func(last_loss))


if __name__ == '__main__':
    unittest.main()
2 changes: 1 addition & 1 deletion datasets/flwr_datasets/partitioner/iid_partitioner.py
@@ -48,5 +48,5 @@ def load_partition(self, idx: int) -> datasets.Dataset:
            single dataset partition
        """
        return self.dataset.shard(
            num_shards=self._num_partitions, index=idx, contiguous=True
            num_shards=self._num_partitions, index=idx, contiguous=False
        )
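
With contiguous=False, partition idx now receives every num_partitions-th row starting at row idx instead of one consecutive block, which is what the updated test below checks. A small illustrative sketch using Hugging Face datasets directly (the ten-row toy dataset is made up for the example):

from datasets import Dataset

# Toy dataset with rows 0..9
dataset = Dataset.from_dict({"features": list(range(10))})

block = dataset.shard(num_shards=5, index=2, contiguous=True)     # consecutive block
strided = dataset.shard(num_shards=5, index=2, contiguous=False)  # strided selection

print(block["features"])    # [4, 5] - one consecutive block
print(strided["features"])  # [2, 7] - every 5th row starting at row 2
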
12 changes: 9 additions & 3 deletions datasets/flwr_datasets/partitioner/iid_partitioner_test.py
@@ -18,6 +18,7 @@
import unittest
from typing import Tuple

import numpy as np
from parameterized import parameterized

from datasets import Dataset
@@ -100,11 +101,16 @@ def test_load_partition_correct_data(
        self, num_partitions: int, num_rows: int
    ) -> None:
        """Test if the data in partition is equal to the expected."""
        _, partitioner = _dummy_setup(num_partitions, num_rows)
        partition_size = num_rows // num_partitions
        dataset, partitioner = _dummy_setup(num_partitions, num_rows)
        partition_index = 2
        partition = partitioner.load_partition(partition_index)
        self.assertEqual(partition["features"][0], partition_index * partition_size)
        row_id = 0
        self.assertEqual(
            partition["features"][row_id],
            dataset[np.arange(partition_index, len(dataset), num_partitions)][
                "features"
            ][row_id],
        )

    @parameterized.expand(  # type: ignore
        [