
Commit

Merge branch 'main' into simulation_examples_with_flower_datasets
jafermarq authored Nov 23, 2023
2 parents ecd824e + bd8c0e0 commit 6d6a54e
Showing 29 changed files with 14,443 additions and 115 deletions.
30 changes: 30 additions & 0 deletions datasets/dev/build-flwr-datasets-docs.sh
@@ -0,0 +1,30 @@
#!/bin/bash
# Generate the docs, then rename and move the files so that they meet the convention used in Flower.
# Note that two runs of sphinx-build are necessary.
# The first run generates the .rst files (its HTML output is discarded).
# The second run happens after the files are renamed and moved to the correct place; it generates the final HTML files.

set -e

cd "$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null 2>&1 && pwd )"/../doc

# Remove the old docs from source/ref-api
REF_API_DIR="source/ref-api"
if [[ -d "$REF_API_DIR" ]]; then

echo "Removing ${REF_API_DIR}"
rm -r ${REF_API_DIR}
fi

# Remove the old html files
if [[ -d build ]]; then
  echo "Removing ./build"
  rm -r build
fi

# Docs generation: Generate new rst files
# It starts at the __init__ in the main directory and recursively generates the documentation for the
# classes/modules/packages specified in __all__.
# Note that if a package cannot be reached via the recursive traversal, it won't be documented, even if it has __all__.
echo "Generating the docs based only on the functionality given in __all__."
sphinx-build -M html source build
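
For context, the recursive traversal above only documents what each package re-exports. A minimal sketch of the pattern the script relies on (illustration only, not part of this commit; it assumes ``FederatedDataset`` is re-exported from the package root)::

    # flwr_datasets/__init__.py (illustrative sketch)
    from flwr_datasets.federated_dataset import FederatedDataset

    # Only the names listed here are picked up by the recursive autosummary run.
    __all__ = ["FederatedDataset"]
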
33 changes: 33 additions & 0 deletions datasets/doc/source/_templates/autosummary/class.rst
@@ -0,0 +1,33 @@
{{ name | escape | underline}}

.. currentmodule:: {{ module }}

.. autoclass:: {{ objname }}
   :members:
   :show-inheritance:
   :inherited-members:

   {% block methods %}

   {% if methods %}
   .. rubric:: {{ _('Methods') }}

   .. autosummary::
   {% for item in methods %}
      {% if item != "__init__" %}
      ~{{ name }}.{{ item }}
      {% endif %}
   {%- endfor %}
   {% endif %}
   {% endblock %}

   {% block attributes %}
   {% if attributes %}
   .. rubric:: {{ _('Attributes') }}

   .. autosummary::
   {% for item in attributes %}
      ~{{ name }}.{{ item }}
   {%- endfor %}
   {% endif %}
   {% endblock %}
66 changes: 66 additions & 0 deletions datasets/doc/source/_templates/autosummary/module.rst
@@ -0,0 +1,66 @@
{{ name | escape | underline}}

.. automodule:: {{ fullname }}

   {% block attributes %}
   {% if attributes %}
   .. rubric:: Module Attributes

   .. autosummary::
      :toctree:
   {% for item in attributes %}
      {{ item }}
   {%- endfor %}
   {% endif %}
   {% endblock %}

   {% block functions %}
   {% if functions %}
   .. rubric:: {{ _('Functions') }}

   .. autosummary::
      :toctree:
   {% for item in functions %}
      {{ item }}
   {%- endfor %}
   {% endif %}
   {% endblock %}

   {% block classes %}
   {% if classes %}
   .. rubric:: {{ _('Classes') }}

   .. autosummary::
      :toctree:
      :template: autosummary/class.rst
   {% for item in classes %}
      {{ item }}
   {%- endfor %}
   {% endif %}
   {% endblock %}

   {% block exceptions %}
   {% if exceptions %}
   .. rubric:: {{ _('Exceptions') }}

   .. autosummary::
      :toctree:
   {% for item in exceptions %}
      {{ item }}
   {%- endfor %}
   {% endif %}
   {% endblock %}

{% block modules %}
{% if modules %}
.. rubric:: Modules

.. autosummary::
   :toctree:
   :template: autosummary/module.rst
   :recursive:
{% for item in modules %}
   {{ item }}
{%- endfor %}
{% endif %}
{% endblock %}
34 changes: 34 additions & 0 deletions datasets/doc/source/conf.py
@@ -61,8 +61,42 @@
"nbsphinx",
]

# Generate .rst files
autosummary_generate = True

# Document ONLY the objects listed in __all__ (present in the __init__ files).
# It is done recursively, starting from flwr_datasets.__init__.
# It's controlled in the index.rst file.
autosummary_ignore_module_all = False

# Each class's and function's docs start with the path to it.
# Make flwr_datasets.federated_dataset.FederatedDataset appear as FederatedDataset.
# The full name is still shown at the top of the page.
add_module_names = False

def find_test_modules(package_path):
    """Go through the python files and exclude every *_test.py file."""
    full_path_modules = []
    for root, dirs, files in os.walk(package_path):
        for file in files:
            if file.endswith('_test.py'):
                # Construct the module path relative to the package directory
                full_path = os.path.join(root, file)
                relative_path = os.path.relpath(full_path, package_path)
                # Convert file path to dotted module path
                module_path = os.path.splitext(relative_path)[0].replace(os.sep, '.')
                full_path_modules.append(module_path)
    modules = []
    for full_path_module in full_path_modules:
        parts = full_path_module.split('.')
        for i in range(len(parts)):
            modules.append('.'.join(parts[i:]))
    return modules

# Stop documenting the *_test.py files.
# That's the only way to do it with autosummary (register the test modules as mock imports).
autodoc_mock_imports = find_test_modules(os.path.abspath("../../"))

# Add any paths that contain templates here, relative to this directory.
templates_path = ["_templates"]

18 changes: 17 additions & 1 deletion datasets/doc/source/how-to-use-with-numpy.rst
@@ -3,14 +3,30 @@ Use with NumPy

Let's integrate ``flwr-datasets`` with NumPy.

Prepare the desired partitioning::
Create a ``FederatedDataset``::

from flwr_datasets import FederatedDataset

fds = FederatedDataset(dataset="cifar10", partitioners={"train": 10})
partition = fds.load_partition(0, "train")
centralized_dataset = fds.load_full("test")

Inspect the names of the features::

partition.features

In the case of CIFAR10, you should see the following output.

.. code-block:: none

    {'img': Image(decode=True, id=None),
     'label': ClassLabel(names=['airplane', 'automobile', 'bird', 'cat', 'deer', 'dog',
                                'frog', 'horse', 'ship', 'truck'], id=None)}

We will use the keys of the partition features to apply transformations to the data or pass it to an ML model. Let's move on to the transformations.

NumPy
-----
Transform to NumPy::

partition_np = partition.with_format("numpy")
10 changes: 6 additions & 4 deletions datasets/doc/source/how-to-use-with-pytorch.rst
@@ -10,7 +10,7 @@ Standard setup - download the dataset, choose the partitioning::
partition = fds.load_partition(0, "train")
centralized_dataset = fds.load_full("test")

Determine the names of our features (you can alternatively do that directly on the Hugging Face website). The name can
Determine the names of the features (you can alternatively do that directly on the Hugging Face website). The name can
vary e.g. "img" or "image", "label" or "labels"::

partition.features
@@ -38,7 +38,7 @@ That is why we iterate over all the samples from this batch and apply our transf
return batch

partition_torch = partition.with_transform(apply_transforms)
# At this point, you can check if you didn't make any mistakes by calling partition_torch[0]
# Now, you can check if you didn't make any mistakes by calling partition_torch[0]
dataloader = DataLoader(partition_torch, batch_size=64)
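
A minimal sketch of consuming this dataloader (illustration only, assuming the CIFAR10 ``img``/``label`` keys shown earlier): each batch comes back as a dictionary keyed by feature name rather than as a tuple::

    for batch in dataloader:
        images, labels = batch["img"], batch["label"]  # both are torch tensors
        # ... forward pass, loss computation, backward pass ...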


@@ -70,8 +70,10 @@ If you want to divide the dataset, you can use (at any point before passing the
Or you can simply calculate the indices yourself::

partition_len = len(partition)
partition_train = partition[:int(0.8 * partition_len)]
partition_test = partition[int(0.8 * partition_len):]
# Split `partition` 80:20
num_train_examples = int(0.8 * partition_len)
partition_train = partition.select(range(num_train_examples))  # use first 80%
partition_test = partition.select(range(num_train_examples, partition_len))  # use last 20%

And during the training loop, you need to apply one change. With a typical dataloader, you get a list returned for each iteration::

48 changes: 25 additions & 23 deletions datasets/doc/source/how-to-use-with-tensorflow.rst
@@ -1,10 +1,32 @@
Use with TensorFlow
===================

Let's integrate ``flwr-datasets`` with TensorFlow. We show you three ways how to convert the data into the formats
Let's integrate ``flwr-datasets`` with ``TensorFlow``. We show you three ways to convert the data into the formats
that ``TensorFlow``'s models expect. Please note that, especially for smaller datasets, the performance of the
following methods is very close. We recommend choosing the method you are most comfortable with.

Create a ``FederatedDataset``::

from flwr_datasets import FederatedDataset

fds = FederatedDataset(dataset="cifar10", partitioners={"train": 10})
partition = fds.load_partition(0, "train")
centralized_dataset = fds.load_full("test")

Inspect the names of the features::

partition.features

In the case of CIFAR10, you should see the following output.

.. code-block:: none

    {'img': Image(decode=True, id=None),
     'label': ClassLabel(names=['airplane', 'automobile', 'bird', 'cat', 'deer', 'dog',
                                'frog', 'horse', 'ship', 'truck'], id=None)}

We will use the keys of the partition features to construct a `tf.data.Dataset <https://www.tensorflow.org/api_docs/python/tf/data/Dataset>`_. Let's move on to the transformations.

NumPy
-----
The first way is to transform the data into NumPy arrays. It's a simpler option that is commonly used. Feel free to
@@ -14,17 +36,7 @@ follow the :doc:`how-to-use-with-numpy` tutorial, especially if you are a beginn

TensorFlow Dataset
------------------
Work with ``TensorFlow Dataset`` abstraction.

Standard setup::

from flwr_datasets import FederatedDataset

fds = FederatedDataset(dataset="cifar10", partitioners={"train": 10})
partition = fds.load_partition(0, "train")
centralized_dataset = fds.load_full("test")

Transformation to the TensorFlow Dataset::
Transform the data to a ``TensorFlow Dataset``::

tf_dataset = partition.to_tf_dataset(columns="img", label_cols="label", batch_size=64,
shuffle=True)
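
As a brief sketch of how such a dataset is typically consumed (illustration only; it assumes a Keras model named ``model`` that has already been defined and compiled)::

    # tf_dataset yields (features, labels) batches, so it can be passed to fit directly
    model.fit(tf_dataset, epochs=5)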
@@ -33,17 +45,7 @@ Transformation to the TensorFlow Dataset::

TensorFlow Tensors
------------------
Change the data type to TensorFlow Tensors (it's not the TensorFlow dataset).

Standard setup::

from flwr_datasets import FederatedDataset

fds = FederatedDataset(dataset="cifar10", partitioners={"train": 10})
partition = fds.load_partition(0, "train")
centralized_dataset = fds.load_full("test")

Transformation to the TensorFlow Tensors ::
Transform the data to TensorFlow `tf.Tensor <https://www.tensorflow.org/api_docs/python/tf/Tensor>`_ (note that this is not a TensorFlow Dataset)::

data_tf = partition.with_format("tf")
# Assuming you have defined your model and compiled it
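# Illustration only (not part of this commit): with the "tf" format each column is
# returned as a tf.Tensor, so a compiled Keras model (here hypothetically named `model`)
# could be trained directly on the tensors:
model.fit(data_tf["img"], data_tf["label"], batch_size=64, epochs=5)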
10 changes: 7 additions & 3 deletions datasets/doc/source/index.rst
@@ -38,11 +38,15 @@ References

Information-oriented API reference and other reference material.

.. toctree::
   :maxdepth: 2
.. autosummary::
   :toctree: ref-api
   :template: autosummary/module.rst
   :caption: API reference
   :recursive:

   flwr_datasets


   ref-api-flwr-datasets

Main features
-------------
27 changes: 0 additions & 27 deletions datasets/doc/source/ref-api-flwr-datasets.rst

This file was deleted.

