From 36eb11c0cae940f6bbd834ddff664f8a71748a16 Mon Sep 17 00:00:00 2001
From: Adam Narozniak <51029327+adam-narozniak@users.noreply.github.com>
Date: Wed, 20 Sep 2023 12:35:07 +0200
Subject: [PATCH] Add FDS how-to guides (#2332)

---
 .../how-to-disable-enable-progress-bar.rst    | 16 ++++
 .../source/how-to-install-flwr-datasets.rst   | 46 ++++++++++++
 datasets/doc/source/how-to-use-with-numpy.rst | 61 +++++++++++++++
 .../doc/source/how-to-use-with-pytorch.rst    | 67 +++++++++++++++++
 .../doc/source/how-to-use-with-tensorflow.rst | 74 +++++++++++++++++++
 5 files changed, 264 insertions(+)
 create mode 100644 datasets/doc/source/how-to-disable-enable-progress-bar.rst
 create mode 100644 datasets/doc/source/how-to-install-flwr-datasets.rst
 create mode 100644 datasets/doc/source/how-to-use-with-numpy.rst
 create mode 100644 datasets/doc/source/how-to-use-with-pytorch.rst
 create mode 100644 datasets/doc/source/how-to-use-with-tensorflow.rst

diff --git a/datasets/doc/source/how-to-disable-enable-progress-bar.rst b/datasets/doc/source/how-to-disable-enable-progress-bar.rst
new file mode 100644
index 000000000000..95a9c7a562b1
--- /dev/null
+++ b/datasets/doc/source/how-to-disable-enable-progress-bar.rst
@@ -0,0 +1,16 @@
+Disable/Enable Progress Bar
+===========================
+
+You will see a progress bar by default when you download a dataset or apply a map function. Here is how to control
+this behavior.
+
+Disable::
+
+    from datasets.utils.logging import disable_progress_bar
+    disable_progress_bar()
+
+Enable::
+
+    from datasets.utils.logging import enable_progress_bar
+    enable_progress_bar()
+
diff --git a/datasets/doc/source/how-to-install-flwr-datasets.rst b/datasets/doc/source/how-to-install-flwr-datasets.rst
new file mode 100644
index 000000000000..d2fd7923a817
--- /dev/null
+++ b/datasets/doc/source/how-to-install-flwr-datasets.rst
@@ -0,0 +1,46 @@
+Installation
+============
+
+Python Version
+--------------
+
+Flower Datasets requires `Python 3.8 <https://docs.python.org/3.8/>`_ or above.
+
+
+Install stable release (pip)
+----------------------------
+
+Stable releases are available on `PyPI <https://pypi.org/project/flwr-datasets/>`_:
+
+.. code-block:: bash
+
+    python -m pip install flwr-datasets
+
+For vision datasets (e.g. MNIST, CIFAR10), ``flwr-datasets`` should be installed with the ``vision`` extra:
+
+.. code-block:: bash
+
+    python -m pip install flwr_datasets[vision]
+
+For audio datasets (e.g. Speech Commands), ``flwr-datasets`` should be installed with the ``audio`` extra:
+
+.. code-block:: bash
+
+    python -m pip install flwr_datasets[audio]
+
+
+Verify installation
+-------------------
+
+Run the following command to verify that Flower Datasets was installed successfully:
+
+.. code-block:: bash
+
+    python -c "import flwr_datasets; print(flwr_datasets.__version__)"
+
+If everything worked, it should print the version of Flower Datasets to the command line:
+
+.. code-block:: none
+
+    0.0.1
+
diff --git a/datasets/doc/source/how-to-use-with-numpy.rst b/datasets/doc/source/how-to-use-with-numpy.rst
new file mode 100644
index 000000000000..c3fbf85969e3
--- /dev/null
+++ b/datasets/doc/source/how-to-use-with-numpy.rst
@@ -0,0 +1,61 @@
+Use with NumPy
+==============
+
+Let's integrate ``flwr-datasets`` with NumPy.
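+
+The first download of CIFAR10 will show Hugging Face progress bars. If you prefer quieter output, you can turn them
+off up front with the helper described in the progress-bar how-to (optional; shown here as a convenience)::
+
+    from datasets.utils.logging import disable_progress_bar
+
+    disable_progress_bar()  # silence download/map progress bars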
+
+Prepare the desired partitioning::
+
+    from flwr_datasets import FederatedDataset
+
+    fds = FederatedDataset(dataset="cifar10", partitioners={"train": 10})
+    partition = fds.load_partition(0, "train")
+    centralized_dataset = fds.load_full("test")
+
+Transform to NumPy::
+
+    partition_np = partition.with_format("numpy")
+    X_train, y_train = partition_np["img"], partition_np["label"]
+
+That's all. Let's check the dimensions and data types of ``X_train`` and ``y_train``::
+
+    print(f"The shape of X_train is: {X_train.shape}, dtype: {X_train.dtype}.")
+    print(f"The shape of y_train is: {y_train.shape}, dtype: {y_train.dtype}.")
+
+You should see::
+
+    The shape of X_train is: (5000, 32, 32, 3), dtype: uint8.
+    The shape of y_train is: (5000,), dtype: int64.
+
+Note that the ``X_train`` values are of type ``uint8``. That is not a problem for a TensorFlow model when you pass the
+data as input, but it is a good reminder to normalize the data: global normalization, per-channel normalization, or
+simply rescaling to the [0, 1] range::
+
+    X_train = (X_train - X_train.mean()) / X_train.std()  # Global normalization
+
+
+CNN Keras model
+---------------
+Here's a quick example of how you can use that data with a simple CNN model::
+
+    from tensorflow.keras import layers, models
+
+    model = models.Sequential([
+        layers.Conv2D(32, (3, 3), activation='relu', input_shape=(32, 32, 3)),
+        layers.MaxPooling2D(2, 2),
+        layers.Conv2D(64, (3, 3), activation='relu'),
+        layers.MaxPooling2D(2, 2),
+        layers.Conv2D(64, (3, 3), activation='relu'),
+        layers.Flatten(),
+        layers.Dense(64, activation='relu'),
+        layers.Dense(10, activation='softmax')
+    ])
+
+    model.compile(optimizer='adam', loss='sparse_categorical_crossentropy',
+                  metrics=['accuracy'])
+    model.fit(X_train, y_train, epochs=20, batch_size=64)
+
+You should see about 98% accuracy on the training data at the end of the training.
+
+Note that we used ``"sparse_categorical_crossentropy"``. Keep it that way if you don't want to one-hot encode
+the labels.
diff --git a/datasets/doc/source/how-to-use-with-pytorch.rst b/datasets/doc/source/how-to-use-with-pytorch.rst
new file mode 100644
index 000000000000..5981f88c26b8
--- /dev/null
+++ b/datasets/doc/source/how-to-use-with-pytorch.rst
@@ -0,0 +1,67 @@
+Use with PyTorch
+================
+Let's integrate ``flwr-datasets`` with PyTorch DataLoaders and keep your PyTorch transforms applied to the data.
+
+Standard setup: download the dataset and choose the partitioning::
+
+    from flwr_datasets import FederatedDataset
+
+    fds = FederatedDataset(dataset="cifar10", partitioners={"train": 10})
+    partition = fds.load_partition(0, "train")
+    centralized_dataset = fds.load_full("test")
+
+Determine the names of the features (you can alternatively do that directly on the Hugging Face website). The names can
+vary, e.g. "img" or "image", "label" or "labels"::
+
+    partition.features
+
+In case of CIFAR10, you should see the following output:
+
+.. code-block:: none
+
+    {'img': Image(decode=True, id=None),
+     'label': ClassLabel(names=['airplane', 'automobile', 'bird', 'cat', 'deer', 'dog',
+     'frog', 'horse', 'ship', 'truck'], id=None)}
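+
+Because these names differ between datasets, it can be handy to read them programmatically instead of hard-coding
+them. A small sketch (it assumes the two-feature layout shown above; ``num_classes`` is available because "label" is a
+``ClassLabel`` feature)::
+
+    image_key, label_key = list(partition.features.keys())   # 'img', 'label' for CIFAR10
+    num_classes = partition.features[label_key].num_classes  # 10 for CIFAR10
+    print(image_key, label_key, num_classes)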
+
+Apply the transforms and create the DataLoader. We will use the
+`map() <https://huggingface.co/docs/datasets/package_reference/main_classes#datasets.Dataset.map>`_ function.
+Please note that ``map`` will modify an existing feature if the key in the dictionary you return is already present,
+and append a new feature if it did not exist before. Below, we modify the "img" feature of our dataset::
+
+    from torch.utils.data import DataLoader
+    from torchvision.transforms import ToTensor
+
+    transforms = ToTensor()
+    partition_torch = partition.map(
+        lambda img: {"img": transforms(img)}, input_columns="img"
+    ).with_format("torch")
+    dataloader = DataLoader(partition_torch, batch_size=64)
+
+We advise you to keep the
+`ToTensor() <https://pytorch.org/vision/stable/generated/torchvision.transforms.ToTensor.html>`_ transform (especially
+if you used it in your PyTorch code) because it swaps the dimensions from (H x W x C) to (C x H x W). This
+channel-first order is what a model with a convolutional layer expects.
+
+If you want to divide the dataset, you can use (at any point before passing the dataset to the DataLoader)::
+
+    partition_train_test = partition.train_test_split(test_size=0.2)
+    partition_train = partition_train_test["train"]
+    partition_test = partition_train_test["test"]
+
+Or you can calculate the split indices yourself. Note that slicing a Hugging Face dataset returns a plain dictionary
+of columns, so use ``select`` to keep a ``Dataset`` object::
+
+    partition_len = len(partition)
+    partition_train = partition.select(range(int(0.8 * partition_len)))
+    partition_test = partition.select(range(int(0.8 * partition_len), partition_len))
+
+And during the training loop, you need to apply one change. With a typical dataloader, you get a list returned for each iteration::
+
+    for batch in all_from_pytorch_dataloader:
+        images, labels = batch
+        # Or alternatively:
+        # images, labels = batch[0], batch[1]
+
+With this dataset, you get a dictionary instead, so you access the data by key rather than by index::
+
+    for batch in dataloader:
+        images, labels = batch["img"], batch["label"]
+
diff --git a/datasets/doc/source/how-to-use-with-tensorflow.rst b/datasets/doc/source/how-to-use-with-tensorflow.rst
new file mode 100644
index 000000000000..86a1f4e0da8a
--- /dev/null
+++ b/datasets/doc/source/how-to-use-with-tensorflow.rst
@@ -0,0 +1,74 @@
+Use with TensorFlow
+===================
+
+Let's integrate ``flwr-datasets`` with TensorFlow. We show you three ways to convert the data into the formats
+that TensorFlow models expect. Please note that, especially for smaller datasets, the performance of the following
+methods is very close. We recommend you choose the method you are the most comfortable with.
+
+NumPy
+-----
+The first way is to transform the data into NumPy arrays. It is the simplest option and is commonly used. Feel free to
+follow the :doc:`how-to-use-with-numpy` tutorial, especially if you are a beginner.
+
+.. _tensorflow-dataset:
+
+TensorFlow Dataset
+------------------
+Work with the ``TensorFlow Dataset`` abstraction.
+
+Standard setup::
+
+    from flwr_datasets import FederatedDataset
+
+    fds = FederatedDataset(dataset="cifar10", partitioners={"train": 10})
+    partition = fds.load_partition(0, "train")
+    centralized_dataset = fds.load_full("test")
+
+Transform to a TensorFlow Dataset::
+
+    tf_dataset = partition.to_tf_dataset(columns="img", label_cols="label", batch_size=64,
+                                         shuffle=True)
+    # Assuming you have defined your model and compiled it
+    model.fit(tf_dataset, epochs=20)
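+
+The centralized test split can be converted the same way, which gives a quick way to evaluate the trained model. A
+sketch under the same assumption of a compiled ``model`` (``shuffle=False``, since order does not matter for
+evaluation)::
+
+    tf_test_dataset = centralized_dataset.to_tf_dataset(columns="img", label_cols="label",
+                                                        batch_size=64, shuffle=False)
+    test_loss, test_acc = model.evaluate(tf_test_dataset)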
+
+TensorFlow Tensors
+------------------
+Change the data type to TensorFlow tensors (note that this is not a TensorFlow dataset).
+
+Standard setup::
+
+    from flwr_datasets import FederatedDataset
+
+    fds = FederatedDataset(dataset="cifar10", partitioners={"train": 10})
+    partition = fds.load_partition(0, "train")
+    centralized_dataset = fds.load_full("test")
+
+Transform to TensorFlow tensors::
+
+    data_tf = partition.with_format("tf")
+    # Assuming you have defined your model and compiled it
+    model.fit(data_tf["img"], data_tf["label"], epochs=20, batch_size=64)
+
+CNN Keras Model
+---------------
+Here's a quick example of how you can use that data with a simple CNN model (it assumes you created the TensorFlow
+dataset as in the section above; see :ref:`TensorFlow Dataset <tensorflow-dataset>`)::
+
+    from tensorflow.keras import layers, models
+
+    model = models.Sequential([
+        layers.Conv2D(32, (3, 3), activation='relu', input_shape=(32, 32, 3)),
+        layers.MaxPooling2D(2, 2),
+        layers.Conv2D(64, (3, 3), activation='relu'),
+        layers.MaxPooling2D(2, 2),
+        layers.Conv2D(64, (3, 3), activation='relu'),
+        layers.Flatten(),
+        layers.Dense(64, activation='relu'),
+        layers.Dense(10, activation='softmax')
+    ])
+
+    model.compile(optimizer='adam', loss='sparse_categorical_crossentropy',
+                  metrics=['accuracy'])
+    model.fit(tf_dataset, epochs=20)
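+
+As in the NumPy guide, the images arrive as ``uint8``. If you would rather let the network handle rescaling itself,
+one option is to prepend a ``Rescaling`` layer to the model defined above (a sketch; ``layers.Rescaling`` is a
+standard Keras layer, and everything else stays unchanged)::
+
+    from tensorflow.keras import layers, models
+
+    # Wrap the CNN from above so uint8 pixels are mapped to [0, 1] first
+    normalized_model = models.Sequential([
+        layers.Rescaling(1.0 / 255, input_shape=(32, 32, 3)),
+        model,  # the CNN defined above
+    ])
+    normalized_model.compile(optimizer='adam', loss='sparse_categorical_crossentropy',
+                             metrics=['accuracy'])
+    normalized_model.fit(tf_dataset, epochs=20)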